Paired T-Test

Field <- c(1,2,3,4,5,6,7)
Fum_A <-c(77,40,11,31,28,50,53)
Fum_B <- c(76,38,10,29,27,48,51)
data <- data.frame(Field,Fum_A,Fum_B)
data
mean_diff <-mean(data$Fum_A) - mean(data$Fum_B)
mean_diff
## [1] 1.571429
sd_diff <- sd(data$Fum_A - data$Fum_B) # sd of the paired differences, not a difference of sds
sd_diff
## [1] 0.5345225
t_calc <- mean_diff/(sd_diff/sqrt(7))
t_calc
## [1] 7.778175
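
Equivalently, the paired test is just a one-sample t-test on the differences:

t.test(data$Fum_A - data$Fum_B) # gives the same t = 7.778 as the paired test below
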
t.test(Fum_A,Fum_B, paired = T)
## 
##  Paired t-test
## 
## data:  Fum_A and Fum_B
## t = 7.7782, df = 6, p-value = 0.0002377
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.077078 2.065779
## sample estimates:
## mean of the differences 
##                1.571429

Paired T-Test

Cola_A <- c(48,45,50,41,55,57,48,47,50,52)
Cola_B <- c(42,40,44,41,50,54,56,50,43,45)
  1. Two samples are paired since the same person tasted both colas.

paired_ttest <- t.test(Cola_A,Cola_B, paired = TRUE)
paired_ttest
## 
##  Paired t-test
## 
## data:  Cola_A and Cola_B
## t = 1.7764, df = 9, p-value = 0.1094
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.7656394  6.3656394
## sample estimates:
## mean of the differences 
##                     2.8

Since the p-value > 0.05, we fail to reject H0: the data do not provide evidence that the mean taste scores of the two colas differ.

Assumptions: normality

par(mfrow = c(1,2))
qqnorm(Cola_A, main = "QQ Plot Cola A")
qqline(Cola_A)
qqnorm(Cola_B, main = "QQ Plot Cola B")
qqline(Cola_B)

  1. Non-parametric procedure: a sign test based on which cola each participant prefers.
  2. Seven of the ten participants feel Cola A tastes better, so the sample proportion is 7/10 = 0.7.
  3. A large-sample 95% confidence interval for the proportion p is 0.7 ± 1.96 √(0.7 × 0.3/10) = (0.42, 0.98); more refined interval formulas could also be used (see the sketch below).
  4. No, we cannot conclude that more than 50% of people like Cola A better, as the interval is not strictly above 0.50.
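
A minimal sketch of this interval computation in R; binom.test() gives an exact alternative:

p_hat <- 7/10
me <- 1.96 * sqrt(p_hat * (1 - p_hat) / 10) # large-sample margin of error
c(p_hat - me, p_hat + me)
## [1] 0.415969 0.984031
binom.test(7, 10)$conf.int # exact (Clopper-Pearson) interval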

Interaction Plot

life<-c(8.6,7.9,5.4,8.2,8.4,5.7,9.4,8.5,5.8,8.8,8.9,5.9)
brand<-rep(c("A","B"),each=6)
brand
##  [1] "A" "A" "A" "A" "A" "A" "B" "B" "B" "B" "B" "B"
device<- rep(c("Radio","Camera","DVD"),4)
device
##  [1] "Radio"  "Camera" "DVD"    "Radio"  "Camera" "DVD"    "Radio"  "Camera"
##  [9] "DVD"    "Radio"  "Camera" "DVD"
data<-data.frame(life,brand,device)
data
interaction.plot(data$brand, data$device, data$life,
                 xlab = "Brand", ylab = "life", main = "Interaction plot",
                 ylim = c(1,10), trace.label = "Device", type = "b",
                 col = c("red","green","blue"), pch = c(19,17), fixed = TRUE)

#using ggplot 
library(ggplot2)
interaction_gg <- ggplot(data, aes(brand, life, color = device)) +
  geom_point() +
  stat_summary(fun = mean, geom = "line", aes(group = device)) + # fun.y is deprecated; use fun
  theme_bw() +
  labs(title = "Interaction plot of brand and device")
interaction_gg

The lines are roughly parallel, which suggests no interaction between brand and device.

#anova to see interaction
model<-aov(life~brand*device,data = data) 
anova(model)

Fail to reject null hypothesis since p-value >0.05. No interaction.

library(emmeans)
#drop the insignificant interaction term
model2<-aov(life~brand+device,data = data)
lsm_bat<-lsmeans(model2,~brand) 
ct<- summary(contrast(lsm_bat,list("A-B"=c(1,-1))),infer=c(T,T),level=0.95,side ="two-sided") 
ct

Since the model does not involve an interaction effect, we recommend Brand B, which has a longer battery life than Brand A; the 95% CI for the difference of main effects is (-0.88, -0.153).
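
The same contrast can be written with the current emmeans API; a minimal sketch, assuming model2 as fitted above:

em <- emmeans(model2, ~ brand) # lsmeans() is a legacy wrapper around emmeans()
summary(pairs(em), infer = c(TRUE, TRUE), level = 0.95)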

fit <- lm(life ~ device, data = data)  # Fit the model
summary(fit)
## 
## Call:
## lm(formula = life ~ device, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5500 -0.1875  0.0250  0.1250  0.6500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.4250     0.1970  42.761 1.04e-11 ***
## deviceDVD    -2.7250     0.2786  -9.780 4.31e-06 ***
## deviceRadio   0.3250     0.2786   1.166    0.273    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3941 on 9 degrees of freedom
## Multiple R-squared:  0.9414, Adjusted R-squared:  0.9284 
## F-statistic: 72.27 on 2 and 9 DF,  p-value: 2.858e-06
par(mfrow = c(2, 2))  # Split the plotting panel into a 2 x 2 grid
plot(fit)  # Plot the model information

Logistic Regression

flight_data <- read.csv("flightdata.csv")
head(flight_data)
str(flight_data)
## 'data.frame':    5196 obs. of  6 variables:
##  $ origin  : Factor w/ 1 level "JFK": 1 1 1 1 1 1 1 1 1 1 ...
##  $ dest    : Factor w/ 46 levels "ABQ","ATL","AUS",..: 30 5 5 27 21 40 13 2 46 34 ...
##  $ delay   : int  1 1 0 0 0 0 0 0 0 1 ...
##  $ carrier : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ air_time: num  3.25 3.1 3.15 2.25 2.23 ...
##  $ weekend : int  1 1 1 1 1 1 1 1 1 1 ...
logistic_reg <- glm(delay ~ air_time + carrier + weekend, family = binomial, data = flight_data)
summary(logistic_reg)
## 
## Call:
## glm(formula = delay ~ air_time + carrier + weekend, family = binomial, 
##     data = flight_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3225  -1.1837  -0.9884   1.1706   1.3798  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.013021   0.060516   0.215    0.830    
## air_time     0.001215   0.016762   0.072    0.942    
## carrier     -0.477806   0.064082  -7.456 8.91e-14 ***
## weekend      0.313711   0.061875   5.070 3.98e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7200.5  on 5195  degrees of freedom
## Residual deviance: 7112.0  on 5192  degrees of freedom
## AIC: 7120
## 
## Number of Fisher Scoring iterations: 4

\[\log(\hat{p}/(1-\hat{p})) = 0.013 + 0.001(\text{air.time}) - 0.478(\text{carrier}) + 0.314(\text{weekend})\]

Since the coefficient for carrier is negative, Delta flights have a lower estimated probability of being delayed than JetBlue flights.

Since the coefficient for weekend is positive, flights departing during the weekend (Sat or Sun) have a higher estimated probability of being delayed than flights departing during the week (Mon-Fri).

The coefficient for air time is not significantly different from zero (p-value = 0.94); this suggests that air time is not a useful predictor of flight delays and can be removed from the model.
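
On the odds scale the carrier and weekend effects are easier to read; a quick sketch using the fitted model:

exp(coef(logistic_reg)) # odds ratios for each term
# e.g. exp(-0.478) ~ 0.62: Delta's odds of delay are about 38% lower than JetBlue's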

#remove air_time
logistic_reg1 <- glm(delay ~ carrier + weekend, family = binomial, data = flight_data)
summary(logistic_reg1)
## 
## Call:
## glm(formula = delay ~ carrier + weekend, family = binomial, data = flight_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3204  -1.1844  -0.9894   1.1704   1.3779  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.01643    0.03805   0.432    0.666    
## carrier     -0.47622    0.06024  -7.905 2.68e-15 ***
## weekend      0.31368    0.06187   5.070 3.99e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7200.5  on 5195  degrees of freedom
## Residual deviance: 7112.0  on 5193  degrees of freedom
## AIC: 7118
## 
## Number of Fisher Scoring iterations: 4

\[\log(\hat{p}/(1-\hat{p})) = 0.016 - 0.476(\text{carrier}) + 0.314(\text{weekend})\]

The estimated probability that a Delta flight departing JFK during the weekend is delayed:

new_data <- data.frame(carrier = 1, weekend = 1) 
new_data
predict(logistic_reg1, newdata=new_data, type="response")
##         1 
## 0.4635359
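
As a hand check, the same probability follows from the inverse logit of the linear predictor, using the coefficients reported above:

eta <- 0.01643 - 0.47622*1 + 0.31368*1 # linear predictor for carrier = 1, weekend = 1
plogis(eta) # inverse logit, ~0.4635, matching predict()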

The estimated probability that a JetBlue flight departing JFK during the weekend is delayed:

new_x <- data.frame(carrier = 0, weekend = 1)
predict(logistic_reg1, newdata=new_x, type="response")
##         1 
## 0.5817854
  1. Suppose a colleague from New York City is making plans to visit you in San Francisco in December. According to the logistic regression model in (c), would you recommend that your colleague fly with Delta or JetBlue in order to avoid delays? If your colleague plans to land at San Francisco International Airport (SFO), describe how you could modify the model in (c) to account for this flight destination.

Delta, because it has the lower estimated probability of being delayed (0.46 vs. 0.58 for JetBlue).

n <-nrow(flight_data)
flight_data$SFO <- rep(0,n)
flight_data$SFO[flight_data$dest == "SFO"] <- 1
logistic_reg3 <- glm(delay ~ carrier + weekend + SFO, family = binomial, data = flight_data) 
summary(logistic_reg3)
## 
## Call:
## glm(formula = delay ~ carrier + weekend + SFO, family = binomial, 
##     data = flight_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3238  -1.1880  -0.8713   1.1669   1.5182  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.02480    0.03820   0.649    0.516    
## carrier     -0.45080    0.06104  -7.385 1.52e-13 ***
## weekend      0.31295    0.06191   5.055 4.31e-07 ***
## SFO         -0.34678    0.13645  -2.541    0.011 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7200.5  on 5195  degrees of freedom
## Residual deviance: 7105.5  on 5192  degrees of freedom
## AIC: 7113.5
## 
## Number of Fisher Scoring iterations: 4

GSS data

gss <- read.csv("gssdata.csv")
head(gss)
dim(gss)
## [1] 1487    2
#contingency table
addmargins(table(gss)) #add margins for sum
##         spend_educ
## gender   About right Too little Too much  Sum
##   Female         142        658       42  842
##   Male           139        453       53  645
##   Sum            281       1111       95 1487
658/1111 # proportion of "Too little" responses that came from females
## [1] 0.5922592
42/95 # proportion of "Too much" responses that came from females
## [1] 0.4421053

Chi-square test of independence:

Null hypothesis, H0: the variables gender and spend_educ are independent. HA: the variables are dependent.

test1 <- chisq.test(gss$gender, gss$spend_educ)
test1
## 
##  Pearson's Chi-squared test
## 
## data:  gss$gender and gss$spend_educ
## X-squared = 13.266, df = 2, p-value = 0.001316
test1$p.value
## [1] 0.001316225
test1$expected
##           gss$spend_educ
## gss$gender About right Too little Too much
##     Female    159.1137   629.0935 53.79287
##     Male      121.8863   481.9065 41.20713

Since the p-value is less than 0.01, we reject the null hypothesis. The data provide evidence that gender and spend_educ are dependent.
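
As a check, the statistic can be recomputed from the observed and expected counts stored in the test object:

sum((test1$observed - test1$expected)^2 / test1$expected) # reproduces X-squared = 13.266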

We may be committing a Type I error: rejecting H0 when it is true, which in this context would mean concluding that gender and spend_educ are dependent when they are actually independent.

Quakes data

head(quakes)
plot(quakes$depth, quakes$mag) # base plot takes vectors directly; it has no data argument

library(tidyverse)
## -- Attaching packages --------------------------- tidyverse 1.3.0 --
## v tibble  2.1.3     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.3
## -- Conflicts ------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
ggplot(quakes, aes( x = depth, y = mag)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

No clear linear relationship; the smoother looks essentially flat.

Using dplyr:

quakes_greater <- quakes %>% filter(depth>400)
quakes_greater %>% ggplot(aes(y=mag, x= depth))+geom_point()+ geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

quakes_smaller <- quakes %>% filter(depth<400)
quakes_smaller %>% ggplot(aes(y=mag, x= depth))+geom_point()+ geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

model <- lm(mag~depth, data = quakes)
summary(model)
## 
## Call:
## lm(formula = mag ~ depth, data = quakes)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.72012 -0.29642 -0.03694  0.19818  1.70014 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.755e+00  2.179e-02 218.168  < 2e-16 ***
## depth       -4.310e-04  5.756e-05  -7.488 1.54e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3921 on 998 degrees of freedom
## Multiple R-squared:  0.05319,    Adjusted R-squared:  0.05225 
## F-statistic: 56.07 on 1 and 998 DF,  p-value: 1.535e-13

The slope is statistically significant (p-value = 1.5e-13), but R-squared is only 0.05, so depth explains very little of the variation in magnitude.

plot(quakes$long, quakes$lat) # again, no data argument for the default plot method

quakes %>% ggplot(aes(y = lat, x= long)) + geom_point()

Map

head(map_data("world"))
map_data("world" ) %>% ggplot(aes(x=long, y = lat)) + geom_path(aes(group=group),color = "red", size = 0.1)

long_lat <- quakes %>% select(lat, long) %>% apply(MARGIN = 2, range) %>% as.data.frame()
head(long_lat)
base_map <- map_data("world") %>% ggplot(aes(x=long, y=lat)) + geom_path(aes(group = group), color = "green", size = 0.1) + lims(x =long_lat$long, y = long_lat$lat)
base_map
## Warning: Removed 98515 row(s) containing missing values (geom_path).

base_map + geom_point(data = quakes, aes(color = mag, size = mag), alpha = 0.5) + theme(legend.position = "none") # alpha is a fixed value, so it belongs outside aes()
## Warning: Removed 98515 row(s) containing missing values (geom_path).

Gasoline data - ANOVA

gas_add <- read.csv('gasoline_additives.csv')
gas_add

One-way ANOVA. Null hypothesis H0: all group means are equal. Alternative hypothesis HA: not all means are equal. alpha = 0.05.

car_group <- rep(c("W","X","Y","Z"), each=6)
mileage <- c(gas_add$W, gas_add$X, gas_add$Y, gas_add$Z)
gas_add1 <- data.frame(car_group, mileage)
gas_add1
plot(mileage~car_group, data = gas_add1)

anova_model <- aov(mileage~car_group, data = gas_add1)
summary(anova_model)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## car_group    3 213.88   71.29   127.2 3.39e-13 ***
## Residuals   20  11.21    0.56                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

p-value < 0.05, reject H0: at least one group mean differs from the others.

Assumptions: normality, independence, equal variance

par(mfrow = c(2,2))
plot(anova_model)

attributes(anova_model)
## $names
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "contrasts"     "xlevels"       "call"          "terms"        
## [13] "model"        
## 
## $class
## [1] "aov" "lm"

Interval Plot

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
plotmeans(mileage~car_group, data = gas_add1, mean.labels = T, main = "Interval plot of W,X,Y and Z")

Tukey Simultaneous Tests for Differences of Means

tukey.test <- TukeyHSD(anova_model)
tukey.test
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = mileage ~ car_group, data = gas_add1)
## 
## $car_group
##           diff        lwr       upr     p adj
## X-W -3.4000000 -4.6097284 -2.190272 0.0000009
## Y-W  3.5666667  2.3569382  4.776395 0.0000004
## Z-W  3.9500000  2.7402716  5.159728 0.0000001
## Y-X  6.9666667  5.7569382  8.176395 0.0000000
## Z-X  7.3500000  6.1402716  8.559728 0.0000000
## Z-Y  0.3833333 -0.8263951  1.593062 0.8116208
plot(tukey.test)

If an interval does not contain zero, the corresponding means are significantly different.

Alternative - Grouping Information Using the Tukey Method and 95% Confidence

library(agricolae)
tukey.test2 <- HSD.test(anova_model, trt = "car_group")
tukey.test2
## $statistics
##     MSerror Df     Mean       CV      MSD
##   0.5604167 20 32.39583 2.310821 1.209728
## 
## $parameters
##    test    name.t ntr StudentizedRange alpha
##   Tukey car_group   4         3.958293  0.05
## 
## $means
##    mileage       std r  Min  Max    Q25   Q50    Q75
## W 31.36667 0.8824209 6 30.1 32.6 30.900 31.35 31.875
## X 27.96667 0.5501515 6 27.4 28.7 27.525 27.85 28.400
## Y 34.93333 0.8869423 6 33.9 36.1 34.200 34.95 35.550
## Z 35.31667 0.6112828 6 34.5 36.2 34.975 35.25 35.675
## 
## $comparison
## NULL
## 
## $groups
##    mileage groups
## Z 35.31667      a
## Y 34.93333      a
## W 31.36667      b
## X 27.96667      c
## 
## attr(,"class")
## [1] "group"

Means that do not share a letter are significantly different. We recommend additives Y or Z.

Preservative Problem

Researchers from the Department of Fruit Crops at a university compared two different preservatives to be used in freezing strawberries. A random sample of 16 bags of strawberries was randomly divided into 2 groups. The bags in Group I served as the control group (current preservative), while those in Group II were assigned to a newly developed preservative. After all 16 bags of strawberries were prepared, they were stored at 0 °C for 6 months. At the end of this time, the contents of each bag were allowed to thaw and were then rated on a scale of 1 to 10 points for discoloration (a low score means little discoloration).

scores <- c(3.70,3.92,3.04,3.71,3.41,3.22,3.50,3.79,   # group I
            1.75,2.06,1.90,2.57,2.20,1.50,1.89,1.52)   # group II
preservative_group <- rep(c("I","II"), each = 8)
df <- data.frame(preservative_group, scores)
df
boxplot(scores~preservative_group)

As can be seen in the box plots, the center of Group I is higher than that of Group II, while their spreads are similar. In addition, the two boxes are well separated and do not overlap at all, so the difference appears significant.

Let mu be the mean discoloration score, with indices 1 and 2 for Groups I and II, respectively. H0: mu1 = mu2 vs. Ha: mu1 > mu2.

test_ind <- t.test(scores ~ preservative_group, mu = 0, alternative = "greater",
                   conf.level = 0.95, var.equal = TRUE)
test_ind
## 
##  Two Sample t-test
## 
## data:  scores by preservative_group
## t = 9.7902, df = 14, p-value = 6.07e-08
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  1.322401      Inf
## sample estimates:
##  mean in group I mean in group II 
##          3.53625          1.92375
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
#to see if population variance are equal
leveneTest(scores~preservative_group)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.

Since the p-value > 0.05, we fail to reject H0: the population variances can be assumed equal.

Non-parametric test

Let m be the median discoloration score, with indices 1 and 2 for Groups I and II, respectively. H0: m1 = m2 vs. Ha: m1 is not equal to m2.

wilcox.test(scores~preservative_group)
## 
##  Wilcoxon rank sum test
## 
## data:  scores by preservative_group
## W = 64, p-value = 0.0001554
## alternative hypothesis: true location shift is not equal to 0

Again, because of the small p-value, we can conclude that the two preservative groups have different distributions of discoloration scores.

What are the assumptions of your test in (b)? Assess these assumptions by graphs and tests. Be sure to include test statistics, p values and test conclusions.

#check normality of discoloration scores
two_groups <- split(scores,preservative_group)
two_groups
## $I
## [1] 3.70 3.92 3.04 3.71 3.41 3.22 3.50 3.79
## 
## $II
## [1] 1.75 2.06 1.90 2.57 2.20 1.50 1.89 1.52
par(mfrow = c(1,2))
qqnorm(two_groups$I, main = "QQ plot for group I")
qqline(two_groups$I)
qqnorm(two_groups$II, main = "QQ plot for group II")
qqline(two_groups$II)

#Shapiro-Wilk Test for normality 
shapiro.test(two_groups$I)
## 
##  Shapiro-Wilk normality test
## 
## data:  two_groups$I
## W = 0.95394, p-value = 0.7508
shapiro.test(two_groups$II)
## 
##  Shapiro-Wilk normality test
## 
## data:  two_groups$II
## W = 0.94875, p-value = 0.6986

As the p values .7508 and .6986 are both large and the normal Q-Q plots are both of linear trend, it is reasonable to assume normality of discoloration for both groups.

#Equal variance
var.test(scores~preservative_group)
## 
##  F test to compare two variances
## 
## data:  scores by preservative_group
## F = 0.71274, num df = 7, denom df = 7, p-value = 0.6662
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.1426924 3.5600492
## sample estimates:
## ratio of variances 
##          0.7127355

As seen in the box plots in (a), the spread stays almost the same for the two groups. In addition, the p-value of the F test is .6662, which is very large. Therefore, we can assume equal variances of the discoloration scores in the two groups.

Combination and Permutation

n<- 10
r <- 2
#combination 
choose(n,r)
## [1] 45
#permutation
choose(n,r) * factorial(r)
## [1] 90
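
Equivalently, the permutation count comes straight from the factorial formula nPr = n!/(n-r)!:

factorial(n) / factorial(n - r) # 10!/8! = 90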

Binomial Distribution

# Cumulative Binomial 
n<-12 #size
p<-0.4 #prob of success
pbinom(5, 12, 0.4) # cumulative binomial dist, at most 5, P(X<=5)
## [1] 0.6652086
dbinom(7, 12, 0.4) # prob P(x=7)
## [1] 0.1009024
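
Upper-tail probabilities follow from the complement; for example:

1 - pbinom(5, 12, 0.4) # P(X >= 6), complement of P(X <= 5)
## [1] 0.3347914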

Exponential Distribution

pexp(31,1/44) #cumulative, P(X<=31), rate lambda = 1/44, mean = 44
## [1] 0.5056668
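
The upper tail again comes from the complement:

pexp(31, 1/44, lower.tail = FALSE) # P(X > 31)
## [1] 0.4943332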

Two Way ANOVA

ob_prob1 <- read.csv("ob_prob1.csv")
ob_prob1
machine <- rep(c("A","B","C","D", "E"), each= 16 )
machine
##  [1] "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "B" "B" "B"
## [20] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "C" "C" "C" "C" "C" "C"
## [39] "C" "C" "C" "C" "C" "C" "C" "C" "C" "C" "D" "D" "D" "D" "D" "D" "D" "D" "D"
## [58] "D" "D" "D" "D" "D" "D" "D" "E" "E" "E" "E" "E" "E" "E" "E" "E" "E" "E" "E"
## [77] "E" "E" "E" "E"
machine_head <- rep(c("1","2","3","4"), each = 4) # 16 values; recycled to length 80 by data.frame()
machine_head
##  [1] "1" "1" "1" "1" "2" "2" "2" "2" "3" "3" "3" "3" "4" "4" "4" "4"
stress_data <- c(6,13,2,7,2,3,10,4,1,9,5,7,8,8,6,9,    # machine A
                 10,2,4,0,9,1,1,3,7,1,7,4,12,10,9,1,   # machine B
                 1,10,8,7,1,11,5,2,5,6,0,5,5,7,7,4,    # machine C
                 11,5,1,3,1,10,8,8,6,8,9,6,4,3,4,5,    # machine D
                 2,6,3,3,4,6,7,1,7,1,4,2,9,3,1,2)      # machine E
df1 <- data.frame(machine_head,machine, stress_data)
df1
#two-way anova
mod_anova <- aov(stress_data~machine_head+machine, data= df1)
summary(mod_anova)
##              Df Sum Sq Mean Sq F value Pr(>F)
## machine_head  3   11.7   3.883   0.361  0.781
## machine       4   53.6  13.394   1.245  0.300
## Residuals    72  774.7  10.760

Null hypotheses: tau1 = tau2 = ... = 0 vs. HA: at least one tau is not 0. F-value = 1.245 with p-value > 0.05, so we fail to reject H0: there does not appear to be a difference among the 5 machines.

For the random blocking factor (machine head), H0: sigma_beta^2 = 0 vs. Ha: sigma_beta^2 > 0. We also fail to reject H0.

SAT Data

sat_data <-read.delim("sat_data.txt") #read text file
head(sat_data)
plot(sat_data$expend,sat_data$sat)

#exploratory data analysis
library(ggplot2)
sat_data %>% ggplot(aes(x = expend, y = sat)) + geom_point() + geom_text(aes(label = state)) + geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

summary(sat_data$sat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   844.0   897.2   945.5   965.9  1032.0  1107.0
summary(sat_data$expend)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.656   4.882   5.768   5.905   6.434   9.774
cor( sat_data$expend,sat_data$sat)
## [1] -0.380537
cor.test(sat_data$sat, sat_data$expend)
## 
##  Pearson's product-moment correlation
## 
## data:  sat_data$sat and sat_data$expend
## t = -2.8509, df = 48, p-value = 0.006408
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5957789 -0.1142957
## sample estimates:
##       cor 
## -0.380537

From the scatter plot there appears to be a negative relationship between expenditure and SAT scores. This is also confirmed by the correlation coefficient of -0.38 (p-value = 0.006). That is, the states that score highest on the SAT are the states that, on average, spend less money per student. For example, high spending states such as New Jersey, New York, Alaska and Connecticut, all post aggregate scores well below the national average combined score of 966.

linear_model <- lm(sat~expend, data = sat_data)
summary(linear_model)
## 
## Call:
## lm(formula = sat ~ expend, data = sat_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -145.074  -46.821    4.087   40.034  128.489 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1089.294     44.390  24.539  < 2e-16 ***
## expend       -20.892      7.328  -2.851  0.00641 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 69.91 on 48 degrees of freedom
## Multiple R-squared:  0.1448, Adjusted R-squared:  0.127 
## F-statistic: 8.128 on 1 and 48 DF,  p-value: 0.006408
#mlr
linear_model2 <- lm(sat~expend + ratio + salary + frac, data=sat_data)
summary(linear_model2)
## 
## Call:
## lm(formula = sat ~ expend + ratio + salary + frac, data = sat_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -90.531 -20.855  -1.746  15.979  66.571 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1045.9715    52.8698  19.784  < 2e-16 ***
## expend         4.4626    10.5465   0.423    0.674    
## ratio         -3.6242     3.2154  -1.127    0.266    
## salary         1.6379     2.3872   0.686    0.496    
## frac          -2.9045     0.2313 -12.559 2.61e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.7 on 45 degrees of freedom
## Multiple R-squared:  0.8246, Adjusted R-squared:  0.809 
## F-statistic: 52.88 on 4 and 45 DF,  p-value: < 2.2e-16

Selection method with the AIC
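
A minimal sketch of how the selection could be run with step(), assuming linear_model2 as fitted above; backward elimination by AIC would be expected to drop ratio and salary, matching the expend + frac model fitted below:

step(linear_model2, direction = "backward") # backward elimination by AIC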

sat.fit <-lm(sat~expend+frac, data = sat_data)
summary(sat.fit)
## 
## Call:
## lm(formula = sat ~ expend + frac, data = sat_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -88.400 -22.884   1.968  19.142  68.755 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 993.8317    21.8332  45.519  < 2e-16 ***
## expend       12.2865     4.2243   2.909  0.00553 ** 
## frac         -2.8509     0.2151 -13.253  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.46 on 47 degrees of freedom
## Multiple R-squared:  0.8195, Adjusted R-squared:  0.8118 
## F-statistic: 106.7 on 2 and 47 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(sat.fit)

From the residuals vs. fitted plot, we can assume linearity and equal variances. The QQ plot looks okay too, so we can assume normality. The standardized residuals vs. fitted values plot shows no outliers in the data set.

With Matrix

cases <- matrix(c(8,22,14,21,42,12,71,90,19),ncol = 3, byrow = TRUE)
rownames(cases) <- c("age less than or equal 21","between 22 and 26","greater than or equal 27")
colnames(cases) <- c("0 accidents","1 or 2", "3 or more")
cases<-as.table(cases)
cases
##                           0 accidents 1 or 2 3 or more
## age less than or equal 21           8     22        14
## between 22 and 26                  21     42        12
## greater than or equal 27           71     90        19
margin.table(cases) # totals
## [1] 299
margin.table(cases, 1) # row totals
## age less than or equal 21         between 22 and 26  greater than or equal 27 
##                        44                        75                       180
margin.table(cases,2) #column totals 
## 0 accidents      1 or 2   3 or more 
##         100         154          45
round(prop.table(cases),2) # table proportion
##                           0 accidents 1 or 2 3 or more
## age less than or equal 21        0.03   0.07      0.05
## between 22 and 26                0.07   0.14      0.04
## greater than or equal 27         0.24   0.30      0.06
margin.table(prop.table(cases)) # total prop
## [1] 1
prop.table(cases,1) #row proportions
##                           0 accidents    1 or 2 3 or more
## age less than or equal 21   0.1818182 0.5000000 0.3181818
## between 22 and 26           0.2800000 0.5600000 0.1600000
## greater than or equal 27    0.3944444 0.5000000 0.1055556
prop.table(cases,2) #column proportions
##                           0 accidents    1 or 2 3 or more
## age less than or equal 21   0.0800000 0.1428571 0.3111111
## between 22 and 26           0.2100000 0.2727273 0.2666667
## greater than or equal 27    0.7100000 0.5844156 0.4222222
margin.table(prop.table(cases,2),2)
## 0 accidents      1 or 2   3 or more 
##           1           1           1
#clusterd bargraph
barplot(cases, legend=T, beside = T, main = "Accidents")

#stacked bargraph
barplot(cases, legend=T, beside = F, main = "Accidents")

plot(cases, main = "Accidents")

#Expected counts
Expected <- as.array(margin.table(cases,1)) %*% t(as.array(margin.table(cases,2))) /
margin.table(cases)
Expected <- as.table(Expected)
Expected
##                           0 accidents    1 or 2 3 or more
## age less than or equal 21   14.715719 22.662207  6.622074
## between 22 and 26           25.083612 38.628763 11.287625
## greater than or equal 27    60.200669 92.709030 27.090301
barplot(Expected, legend = T, beside = T, main = "Expected")

summary(cases)
## Number of cases in table: 299 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 16.741, df = 4, p-value = 0.00217
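
The same test of independence can be run directly on the table:

chisq.test(cases) # should match Chisq = 16.741, df = 4 above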

Injury Data

injury_data <- read.csv("INJURY9.csv")
injury_data
plot(INJURY~CARCLASS, data = injury_data)

summary(injury_data)
##     CARCLASS      INJURY      
##  CLASS5 :23   Min.   : 52.00  
##  CLASS1 :20   1st Qu.: 78.00  
##  CLASS4 :18   Median : 96.50  
##  CLASS2 :13   Mean   : 99.65  
##  CLASS8 :13   3rd Qu.:118.25  
##  CLASS9 : 8   Max.   :177.00  
##  (Other):17
anova_injury <- aov(INJURY~CARCLASS, data = injury_data)
summary(anova_injury)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## CARCLASS      8  54762    6845    22.8 <2e-16 ***
## Residuals   103  30917     300                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

p-value < 0.05, reject H0: at least one car-class mean injury level differs from the others.

Assumptions: normality, independence, equal variance

par(mfrow = c(1,2))
plot(anova_injury, which = c(1,2)) # use the injury model here, not the gasoline one

attributes(anova_injury)
## $names
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "contrasts"     "xlevels"       "call"          "terms"        
## [13] "model"        
## 
## $class
## [1] "aov" "lm"


Design of Experiment

#observational study
data2<- rbind(c(191,304), c(300,641))
data2
##      [,1] [,2]
## [1,]  191  304
## [2,]  300  641
fisher.test(data2, alternative = "greater")
## 
##  Fisher's Exact Test for Count Data
## 
## data:  data2
## p-value = 0.006593
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  1.101837      Inf
## sample estimates:
## odds ratio 
##   1.342128
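
For comparison, the simple cross-product odds ratio can be computed by hand; fisher.test() reports a conditional maximum-likelihood estimate, which is close but not identical:

(data2[1,1] * data2[2,2]) / (data2[1,2] * data2[2,1]) # (191*641)/(304*300) ~ 1.342
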
#qqPlot with a point-wise 95% confidence envelope; points within the envelope are consistent with normality
library(car)
qqPlot(data2)

## [1] 4 1
acids <- c(1.697,1.601,1.830,2.032,2.017,2.409,2.211,1.673,1.973,2.091,2.255,2.987)
R50 <- rep(c("no","yes","no","yes"), each = 3)
R21 <- rep(c("no", "no","yes","yes"), each = 3)
cheddar <- data.frame(R50, R21, acids)
cheddar
library(phia)
modelAB<-aov(acids~R50+R21+R50:R21, data = cheddar)
summary(modelAB)
##             Df Sum Sq Mean Sq F value Pr(>F)  
## R50          1 0.6561  0.6561   7.234 0.0275 *
## R21          1 0.2144  0.2144   2.364 0.1627  
## R50:R21      1 0.0018  0.0018   0.020 0.8922  
## Residuals    8 0.7257  0.0907                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The large p-value for the interaction term indicates there may be no interaction effect in this model.

IM <- interactionMeans(modelAB)
plot(IM)

dummy.coef(modelAB)
## Full coefficients are 
##                                                            
## (Intercept):      1.709333                                 
## R50:                    no        yes                      
##                  0.0000000  0.4433333                      
## R21:                    no        yes                      
##                      0.000      0.243                      
## R50:R21:             no:no     yes:no     no:yes    yes:yes
##                 0.00000000 0.00000000 0.00000000 0.04866667
#model without interaction
mod_2 <-aov(acids~R50+R21, data = cheddar)
summary(mod_2)
##             Df Sum Sq Mean Sq F value Pr(>F)  
## R50          1 0.6561  0.6561   8.118 0.0191 *
## R21          1 0.2144  0.2144   2.653 0.1378  
## Residuals    9 0.7274  0.0808                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R50 has the larger effect on cheese quality (p = 0.0191 vs. 0.1378 for R21).

library(ACSWR)
data("battery")
battery$Material <- as.factor(battery$Material)
battery$Temperature <- as.factor(battery$Temperature)
battery
#ANOVA
mod_3 <- aov(Life ~ Material * Temperature, data = battery) # A*B expands to A + B + A:B
summary(mod_3)
##                      Df Sum Sq Mean Sq F value   Pr(>F)    
## Material              2  10684    5342   7.911  0.00198 ** 
## Temperature           2  39119   19559  28.968 1.91e-07 ***
## Material:Temperature  4   9614    2403   3.560  0.01861 *  
## Residuals            27  18231     675                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value for the interaction is < 0.05, so we reject H0 and conclude there is a Material-by-Temperature interaction.
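
A quick way to see this interaction, mirroring the interaction.plot usage earlier (a sketch using the battery factors created above):

with(battery, interaction.plot(Temperature, Material, Life,
                               type = "b", trace.label = "Material"))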

#check model assumptions
par(mfrow= c(2,2))
plot(mod_3)

plot(mod_3, which = 2)

shapiro.test(mod_3$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mod_3$residuals
## W = 0.97606, p-value = 0.6117
#boxplot
boxplot(Life~Material*Temperature, data = battery)

library(car)
leveneTest(Life~Material*Temperature, data = battery)