Test 1 Solutions

1. Load the data:

setwd("/Users/traves/Dropbox/SM339/test1")
BT = read.csv("BaseballTimes.csv")
summary(BT)

##       Game   League      Runs          Margin         Pitchers   
##  ARI-SD :1   AL:7   Min.   : 3.0   Min.   : 1.00   Min.   : 4.0  
##  BOS-NYY:1   NL:8   1st Qu.: 5.5   1st Qu.: 1.00   1st Qu.: 5.0  
##  CHI-BAL:1          Median :10.0   Median : 4.00   Median : 7.0  
##  CHI-PIT:1          Mean   :10.1   Mean   : 3.87   Mean   : 8.2  
##  CIN-HOU:1          3rd Qu.:13.0   3rd Qu.: 5.00   3rd Qu.:10.5  
##  CLE-DET:1          Max.   :23.0   Max.   :12.00   Max.   :16.0  
##  (Other):9                                                       
##    Attendance         Time    
##  Min.   :13478   Min.   :133  
##  1st Qu.:17734   1st Qu.:154  
##  Median :30395   Median :168  
##  Mean   :29769   Mean   :183  
##  3rd Qu.:38102   3rd Qu.:194  
##  Max.   :55058   Max.   :317  
##

attach(BT)

Get mean and median of Time:

mean(Time)  #182.67 mins

## [1] 182.7

median(Time)  # 168

## [1] 168

Regression Line:

TP = lm(Time ~ Pitchers)
summary(TP)

## 
## Call:
## lm(formula = Time ~ Pitchers)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -37.94  -8.44  -3.10   9.75  50.79 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    94.84      13.39    7.08  8.2e-06 ***
## Pitchers       10.71       1.49    7.21  6.9e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 21.5 on 13 degrees of freedom
## Multiple R-squared:  0.8,    Adjusted R-squared: 0.784 
## F-statistic: 51.9 on 1 and 13 DF,  p-value: 6.88e-06

94.843 + 10.71 * 2  # = 116.263

## [1] 116.3

A game with only 2 pitchers lasts an average of 116.263 minutes and each extra pitcher increases this average time by 10.71 minutes.

Plotting:

plot(Time ~ Pitchers, pch = 19, col = "blue", xlab = "Number of Pitchers", ylab = "Length of Game (in minutes)", 
    main = "Length of Baseball Game versus Pitchers")
abline(TP, col = "green")

plot of chunk unnamed-chunk-4


pdf(file = "plotoutput.pdf")
plot(Time ~ Pitchers, pch = 19, col = "blue", xlab = "Number of Pitchers", ylab = "Length of Game (in minutes)", 
    main = "Length of Baseball Game versus Pitchers")
abline(TP, col = "green")
dev.off()

## pdf 
##   2

RSE:

summary(TP)

## 
## Call:
## lm(formula = Time ~ Pitchers)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -37.94  -8.44  -3.10   9.75  50.79 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    94.84      13.39    7.08  8.2e-06 ***
## Pitchers       10.71       1.49    7.21  6.9e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 21.5 on 13 degrees of freedom
## Multiple R-squared:  0.8,    Adjusted R-squared: 0.784 
## F-statistic: 51.9 on 1 and 13 DF,  p-value: 6.88e-06

# RSE = 21.46

Outliers:

plot(rstandard(TP) ~ TP$fitted, col = "blue", pch = 19)
abline(0, 0, col = "red")

plot of chunk unnamed-chunk-6

# outlier at about (270,3)
which(rstandard(TP) > 2)

## 15 
## 15

# Outlier is game 15
BT[15, ]

##       Game League Runs Margin Pitchers Attendance Time
## 15 NYM-PHI     NL   15      1       16      45204  317

rstandard(TP)[15]

##    15 
## 2.956

Confidence Interval:

meantime = mean(Time)
sdtime = sd(Time)
n = length(Time)
conf = 0.95
tstat = qt(0.975, n - 1)
low = meantime - tstat * sdtime/sqrt(n)
high = meantime + tstat * sdtime/sqrt(n)
print(c(low, high))

## [1] 157.1 208.3

Alternatively:

t.test(Time, conf.level = 0.95)

## 
##  One Sample t-test
## 
## data:  Time 
## t = 15.31, df = 14, p-value = 3.877e-10
## alternative hypothesis: true mean is not equal to 0 
## 95 percent confidence interval:
##  157.1 208.3 
## sample estimates:
## mean of x 
##     182.7

We can only conclude that the confidence interval is valid for the length of games played on that day, Aug. 26, 2008. However, if that day is representative of all days on which games are played then we might be justified in making the stronger claim in part (f).

2. Load Data:

MR = read.csv("MetabolicRate.csv")
summary(MR)

##     Computer      BodySize      LogBodySize          Instar    
##  Min.   :1.0   Min.   :0.002   Min.   :-2.6778   Min.   :1.00  
##  1st Qu.:1.0   1st Qu.:0.033   1st Qu.:-1.4815   1st Qu.:2.00  
##  Median :1.0   Median :0.167   Median :-0.7765   Median :3.00  
##  Mean   :1.5   Mean   :0.684   Mean   :-0.7389   Mean   :3.16  
##  3rd Qu.:2.0   3rd Qu.:1.030   3rd Qu.: 0.0128   3rd Qu.:4.00  
##  Max.   :3.0   Max.   :4.803   Max.   : 0.6815   Max.   :5.00  
##      CO2ppm           Mrate           LogMrate      
##  Min.   :   0.4   Min.   :  0.03   Min.   :-1.5477  
##  1st Qu.:  12.8   1st Qu.:  0.94   1st Qu.:-0.0265  
##  Median :  57.3   Median :  4.51   Median : 0.6541  
##  Mean   : 177.1   Mean   : 14.34   Mean   : 0.6294  
##  3rd Qu.: 205.9   3rd Qu.: 16.49   3rd Qu.: 1.2173  
##  Max.   :1467.3   Max.   :100.57   Max.   : 2.0025

attach(MR)

Plot 4 graphs:

plot(Mrate ~ BodySize, pch = 19, col = "blue")  # kind of linear but a lot of spread

plot of chunk unnamed-chunk-10

plot(Mrate ~ LogBodySize, pch = 19, col = "blue")  # not linear

plot of chunk unnamed-chunk-10

plot(LogMrate ~ BodySize, pch = 19, col = "blue")  # not linear

plot of chunk unnamed-chunk-10

plot(LogMrate ~ LogBodySize, pch = 19, col = "blue")  # good linear fit

plot of chunk unnamed-chunk-10

The strongest linear fit is in the log-log model: LogMrate ~ LogBodySize. The next strongest linear fit is in the original variables: Mrate ~ BodySize.

Regression:

MB = lm(LogMrate ~ LogBodySize)
summary(MB)

## 
## Call:
## lm(formula = LogMrate ~ LogBodySize)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5920 -0.1119  0.0036  0.1212  0.4729 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.3066     0.0136    96.3   <2e-16 ***
## LogBodySize   0.9164     0.0124    74.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.175 on 303 degrees of freedom
## Multiple R-squared: 0.948,   Adjusted R-squared: 0.948 
## F-statistic: 5.51e+03 on 1 and 303 DF,  p-value: <2e-16

REGRESSION LINE: LogMrate = 1.30655 + 0.91641*LogBodySize

10^(1.30655 + 0.01641 * log(base = 10, 1))  # = 20.25583

## [1] 20.26

The model predicts that the metabolic rate of a 1 gram caterpillar should be 20.25583.

Enter data:

Sleep = c(5.5, 5, 6, 6.5, 6, 6.5, 7, 10, 6.5, 8, 7)
Class = c(4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1)
Time = 4 - Class

Mean and dtandard deviation:

mean(Sleep)

## [1] 6.727

sd(Sleep)

## [1] 1.348

80% CI:

t.test(Sleep, conf.level = 0.8)

## 
##  One Sample t-test
## 
## data:  Sleep 
## t = 16.55, df = 10, p-value = 1.357e-08
## alternative hypothesis: true mean is not equal to 0 
## 80 percent confidence interval:
##  6.169 7.285 
## sample estimates:
## mean of x 
##     6.727

H0: \( \mu \) = 7.5 hours

versus

H1 (or HA): \( \mu \) < 7.5 hours

To actually do the test we look at the statistic \( (\bar{Sleep} - 7.5)/(sd(Sleep)/sqrt(11)) \) which is distributed as a T-distribution with 10 degrees of freedom.

(mean(Sleep) - 7.5)/(sd(Sleep)/sqrt(length(Sleep)))

## [1] -1.901

p = pt(-1.900658, df = 10)
p

## [1] 0.04326

The p-value is 0.04326425. This is less than the level of significance, 0.05, so we reject the null hypothesis. The data supports the alternative hypothesis. MIDN Dozy's claim is supported by the evidence.

However, MIDN Dozy should not try to extrapolate his infrerence to all college students since midshipmen are very special kinds of college students. As well, the date that MIDN Dozy chose to do his sampling on is actually a very special day (the first day of x-week) and so the responses will be different from students at USNA than at most other colleges, which don't have x-week exams.

MIDN Dozy's study was an observational study. He did not randomly assign midshipmen to a control and response group.

Sleep is a quantitative variable and Class is a qualitative variable.

Plots and linear models:

plot(Sleep ~ Time, col = "blue", pch = 19, xlab = "Time (in full years) at USNA", 
    ylab = "Amount of Sleep in hours", main = "Sleep versus Time")
ST = lm(Sleep ~ Time)
abline(ST, col = "red", lwd = 3)

plot of chunk unnamed-chunk-17

summary(ST)

## 
## Call:
## lm(formula = Sleep ~ Time)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0435 -0.5326 -0.1304  0.0652  2.7609 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.630      0.517   10.89  1.8e-06 ***
## Time           0.804      0.299    2.69    0.025 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 1.06 on 9 degrees of freedom
## Multiple R-squared: 0.446,   Adjusted R-squared: 0.385 
## F-statistic: 7.26 on 1 and 9 DF,  p-value: 0.0246

The general pattern revealed in the data is that the amount of sleep midshipmen get increases as they spend more time at USNA.

Residuals:

require(lattice)

## Loading required package: lattice

require(latticeExtra)

## Loading required package: latticeExtra

## Loading required package: RColorBrewer

densityplot(ST$residuals)

plot of chunk unnamed-chunk-18

The extra-long right tail means that the residuals are right-skewed.

Outlier:

plot(rstandard(ST) ~ ST$fitted)

plot of chunk unnamed-chunk-19

which(rstandard(ST) > 2)

## 8 
## 8

Sleep[8]

## [1] 10

Time[8]

## [1] 2

Sleep[8] - (5.6304 + 0.8043 * 2)  # residual is 2.761

## [1] 2.761

The outlier is the 2nd class that slept 10 hours. The residual is 2.761 hours.

The regression line with all data is: Sleep = 5.6304+0.8043*(TimeAtUSNA)

STo = lm(Sleep[-8] ~ Time[-8])
summary(STo)

## 
## Call:
## lm(formula = Sleep[-8] ~ Time[-8])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5620 -0.3151  0.0434  0.2934  0.5041 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.562      0.203   27.44  3.4e-09 ***
## Time[-8]       0.645      0.119    5.41  0.00063 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.414 on 8 degrees of freedom
## Multiple R-squared: 0.786,   Adjusted R-squared: 0.759 
## F-statistic: 29.3 on 1 and 8 DF,  p-value: 0.000635

The regression line computed without the outlier is: Sleep = 5.5620 + 0.64446*(TimeAtUSNA).

The regression line doesn't change that much because the outlier's explanatory variable (TimeAtUSNA) is close to its mean. This outlier does not have a lot of leverage. We can see this intuitively when looking at the two regression lines:

plot(Sleep ~ Time, col = "blue", pch = 19, xlab = "Time (in full years) at USNA", 
    ylab = "Amount of Sleep in hours", main = "Sleep versus Time")
legend(0.25, 9.5, c("Regression with all data", "Regression without outlier"), 
    lty = c(1, 1), lwd = c(3, 3), col = c("red", "green"))
abline(ST, col = "red", lwd = 3)
abline(STo, col = "green", lwd = 3)

plot of chunk unnamed-chunk-21