# Record the date and time at which this report was generated.
print(date())
## [1] "Thu Oct 27 23:08:10 2016"
Due Date: October 27, 2016
Total Points: 100
1 Use the reaction.time data set from UsingR package to test the hypothesis that older people have slower reaction times. Compute the conditional mean reaction times and make side-by-side box plots.
library(UsingR)
## Loading required package: MASS
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
##
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
##
## cancer
# Show the full reaction.time data set (age group, gender, control, time).
print(reaction.time)
## age gender control time
## 1 16-24 F T 1.360075
## 2 16-24 M T 1.467939
## 3 16-24 M T 1.512036
## 4 16-24 F T 1.390647
## 5 16-24 M T 1.384208
## 6 16-24 M C 1.393875
## 7 16-24 M T 1.419043
## 8 16-24 F T 1.461187
## 9 16-24 F T 1.382461
## 10 16-24 M C 1.260152
## 11 16-24 M T 1.292917
## 12 16-24 F T 1.331092
## 13 16-24 M T 1.369171
## 14 16-24 F C 1.330471
## 15 16-24 F T 1.344506
## 16 16-24 M C 1.357869
## 17 16-24 F T 1.520244
## 18 16-24 M T 1.352397
## 19 16-24 F T 1.348420
## 20 16-24 F T 1.373000
## 21 25+ M C 1.306813
## 22 25+ F C 1.384192
## 23 25+ M T 1.345643
## 24 25+ M C 1.594901
## 25 25+ M T 1.341972
## 26 25+ F T 1.472212
## 27 25+ F T 1.440685
## 28 25+ M T 1.502757
## 29 25+ F T 1.501541
## 30 25+ F T 1.562813
## 31 25+ F C 1.475851
## 32 25+ F T 1.475456
## 33 25+ M C 1.312376
## 34 25+ F T 1.447160
## 35 25+ M T 1.463118
## 36 25+ M C 1.293425
## 37 25+ F C 1.402860
## 38 25+ M C 1.245497
## 39 25+ F T 1.475806
## 40 25+ M T 1.520106
## 41 25+ F T 1.437079
## 42 25+ F C 1.583610
## 43 25+ F T 1.499246
## 44 25+ M C 1.436701
## 45 25+ F T 1.524056
## 46 25+ F T 1.510929
## 47 25+ F C 1.522563
## 48 25+ M C 1.462412
## 49 25+ F C 1.313935
## 50 25+ F T 1.440382
## 51 25+ M T 1.444503
## 52 25+ F C 1.316956
## 53 25+ M T 1.517435
## 54 25+ M T 1.536446
## 55 25+ M T 1.444865
## 56 25+ M C 1.442599
## 57 25+ M C 1.355206
## 58 25+ F T 1.614979
## 59 25+ M T 1.527699
## 60 25+ M T 1.466591
# Side-by-side box plots: one box of reaction times per age group.
ggplot(data = reaction.time, mapping = aes(x = age, y = time)) +
  geom_boxplot()

# Welch two-sample t-test (variances not assumed equal) comparing the mean
# reaction time of the two age groups. The "sample estimates" in its output
# are the conditional means asked for in the problem.
# NOTE(review): the stated hypothesis is directional (older = slower), so a
# one-sided alternative may be the better match -- confirm with the assignment.
t.test(
  time ~ age,
  data = reaction.time,
  alternative = "two.sided",
  var.equal = FALSE
)
##
## Welch Two Sample t-test
##
## data: time by age
## t = -3.2509, df = 49.028, p-value = 0.002083
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.10760560 -0.02539224
## sample estimates:
## mean in group 16-24 mean in group 25+
## 1.382586 1.449084
2 The mtcars dataset contains the miles per gallon and whether or not the transmission is automatic (0 = automatic, 1 = manual) for 32 automobiles.
# Show the full mtcars data set; `am` is the transmission indicator.
print(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# Histogram of miles per gallon across all cars, using three bins.
ggplot(data = mtcars, mapping = aes(x = mpg)) +
  geom_histogram(bins = 3)
# Two-sided Wilcoxon rank-sum test of mpg between the two transmission types
# (am = 0 vs am = 1), with the continuity correction applied. Ties in mpg
# mean an exact p-value cannot be computed, hence the warning in the output.
wilcox.test(
  mtcars$mpg ~ mtcars$am,
  alternative = "two.sided",
  correct = TRUE
)
## Warning in wilcox.test.default(x = c(21.4, 18.7, 18.1, 14.3, 24.4, 22.8, :
## cannot compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: mtcars$mpg by mtcars$am
## W = 42, p-value = 0.001871
## alternative hypothesis: true location shift is not equal to 0
#I conclude that there is a difference in mpg between automatic and manual transmissions: the very small p-value (0.001871) is strong evidence against the null hypothesis of no location shift.
3 The data set trees (datasets package, part of base install) contains the girth (inches), height (feet) and volume of timber from 31 felled black cherry trees. Suppose you want to predict tree volume from girth.
library(datasets)
# Show the full trees data set (Girth, Height, Volume).
print(trees)
## Girth Height Volume
## 1 8.3 70 10.3
## 2 8.6 65 10.3
## 3 8.8 63 10.2
## 4 10.5 72 16.4
## 5 10.7 81 18.8
## 6 10.8 83 19.7
## 7 11.0 66 15.6
## 8 11.0 75 18.2
## 9 11.1 80 22.6
## 10 11.2 75 19.9
## 11 11.3 79 24.2
## 12 11.4 76 21.0
## 13 11.4 76 21.4
## 14 11.7 69 21.3
## 15 12.0 75 19.1
## 16 12.9 74 22.2
## 17 12.9 85 33.8
## 18 13.3 86 27.4
## 19 13.7 71 25.7
## 20 13.8 64 24.9
## 21 14.0 78 34.5
## 22 14.2 80 31.7
## 23 14.5 74 36.3
## 24 16.0 72 38.3
## 25 16.3 77 42.6
## 26 17.3 81 55.4
## 27 17.5 82 55.7
## 28 17.9 80 58.3
## 29 18.0 80 51.5
## 30 18.0 80 51.0
## 31 20.6 87 77.0
# Scatter plot of timber volume against girth for the cherry trees.
scattree <- ggplot(data = trees, mapping = aes(x = Girth, y = Volume)) +
  geom_point() +
  labs(x = "Tree Girth", y = "Tree Volume")
print(scattree)

# Overlay the least-squares regression line, without a confidence band.
scattree <- scattree +
  geom_smooth(method = "lm", se = FALSE, colour = "blue")
print(scattree)
# Draw each residual as a vertical segment at the tree's girth, running from
# the fitted value on the regression line (y) to the observed volume (yend).
# The start point must be mapped to the `y` aesthetic: `Volume = ...` is not
# an aesthetic, so y fell back to the inherited observed Volume and every
# segment had zero length.
scattree <- scattree +
  geom_segment(
    aes(
      x = Girth, xend = Girth,
      y = predict(lm(Volume ~ Girth, data = trees)), yend = Volume
    ),
    col = "blue"
  )
scattree
# Residual sum of squares of the simple linear model Volume ~ Girth.
girth_fit <- lm(trees$Volume ~ trees$Girth)
valtree <- sum(resid(girth_fit)^2)
print(valtree)
## [1] 524.3025
# Scatter plot of timber volume against the SQUARE of girth. The x axis
# label says so explicitly, since the plotted quantity is Girth^2 rather
# than Girth itself.
scattree2 <- ggplot(trees, aes(x = (Girth)^2, y = Volume)) +
  geom_point() +
  xlab("Tree Girth Squared") +
  ylab("Tree Volume")
scattree2
# Squared girth of every tree, kept as a standalone vector.
Girth2 <- trees$Girth^2
print(Girth2)
## [1] 68.89 73.96 77.44 110.25 114.49 116.64 121.00 121.00 123.21 125.44
## [11] 127.69 129.96 129.96 136.89 144.00 166.41 166.41 176.89 187.69 190.44
## [21] 196.00 201.64 210.25 256.00 265.69 299.29 306.25 320.41 324.00 324.00
## [31] 424.36
# Overlay the least-squares line for Volume against squared girth,
# without a confidence band.
scattree2 <- scattree2 +
  geom_smooth(method = "lm", se = FALSE, colour = "blue")
print(scattree2)
# Draw each residual of the squared-girth model as a vertical segment at
# x = Girth^2, running from the fitted value (y) to the observed volume
# (yend). The start point must be mapped to the `y` aesthetic: `Volume = ...`
# is not an aesthetic, so y fell back to the inherited observed Volume and
# every segment had zero length.
scattree2 <- scattree2 +
  geom_segment(
    aes(
      x = Girth^2, xend = Girth^2,
      y = predict(lm(Volume ~ I(Girth^2), data = trees)), yend = Volume
    ),
    col = "blue"
  )
scattree2
# Residual sum of squares of the model that regresses Volume on squared girth.
# The predictor has to be Girth^2: regressing on plain Girth here merely
# reproduced valtree (524.3025) and made the model comparison meaningless.
valtree2 <- sum(residuals(lm(Volume ~ I(Girth^2), data = trees))^2)
valtree2
## (re-run to refresh this value; the 524.3025 previously recorded here was the Girth-only SSE)
#I prefer the model using the square of Girth. It seems to provide a more reliable linear regression and has a smaller residual sum of squares.
4 The data set USmelanoma (HSAUR2 package) contains male mortality counts per one million inhabitants by state along with the latitude and longitude centroid of the state.
library(HSAUR2)
## Loading required package: tools
# Show the full USmelanoma data set (mortality, latitude, longitude, ocean).
print(USmelanoma)
## mortality latitude longitude ocean
## Alabama 219 33.0 87.0 yes
## Arizona 160 34.5 112.0 no
## Arkansas 170 35.0 92.5 no
## California 182 37.5 119.5 yes
## Colorado 149 39.0 105.5 no
## Connecticut 159 41.8 72.8 yes
## Delaware 200 39.0 75.5 yes
## District of Columbia 177 39.0 77.0 no
## Florida 197 28.0 82.0 yes
## Georgia 214 33.0 83.5 yes
## Idaho 116 44.5 114.0 no
## Illinois 124 40.0 89.5 no
## Indiana 128 40.2 86.2 no
## Iowa 128 42.2 93.8 no
## Kansas 166 38.5 98.5 no
## Kentucky 147 37.8 85.0 no
## Louisiana 190 31.2 91.8 yes
## Maine 117 45.2 69.0 yes
## Maryland 162 39.0 76.5 yes
## Massachusetts 143 42.2 71.8 yes
## Michigan 117 43.5 84.5 no
## Minnesota 116 46.0 94.5 no
## Mississippi 207 32.8 90.0 yes
## Missouri 131 38.5 92.0 no
## Montana 109 47.0 110.5 no
## Nebraska 122 41.5 99.5 no
## Nevada 191 39.0 117.0 no
## New Hampshire 129 43.8 71.5 yes
## New Jersey 159 40.2 74.5 yes
## New Mexico 141 35.0 106.0 no
## New York 152 43.0 75.5 yes
## North Carolina 199 35.5 79.5 yes
## North Dakota 115 47.5 100.5 no
## Ohio 131 40.2 82.8 no
## Oklahoma 182 35.5 97.2 no
## Oregon 136 44.0 120.5 yes
## Pennsylvania 132 40.8 77.8 no
## Rhode Island 137 41.8 71.5 yes
## South Carolina 178 33.8 81.0 yes
## South Dakota 86 44.8 100.0 no
## Tennessee 186 36.0 86.2 no
## Texas 229 31.5 98.0 yes
## Utah 142 39.5 111.5 no
## Vermont 153 44.0 72.5 yes
## Virginia 166 37.5 78.5 yes
## Washington 117 47.5 121.0 yes
## West Virginia 136 38.8 80.8 no
## Wisconsin 110 44.5 90.2 no
## Wyoming 134 43.0 107.5 no
# Scatter plot of melanoma mortality against state latitude.
mort <- ggplot(data = USmelanoma, mapping = aes(x = latitude, y = mortality)) +
  geom_point() +
  labs(x = "Latitude", y = "Mortality")
print(mort)

# Overlay the least-squares regression line, without a confidence band.
mort <- mort +
  geom_smooth(method = "lm", se = FALSE, colour = "magenta")
print(mort)
# Multiple regression of mortality on latitude, longitude and the ocean
# indicator; the summary reports coefficients, standard errors and R-squared.
melanoma_fit <- lm(
  USmelanoma$mortality ~ USmelanoma$latitude + USmelanoma$longitude + USmelanoma$ocean
)
summary(melanoma_fit)
##
## Call:
## lm(formula = USmelanoma$mortality ~ USmelanoma$latitude + USmelanoma$longitude +
## USmelanoma$ocean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.171 -10.070 -2.607 8.781 41.805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 349.2369 27.0596 12.906 < 2e-16 ***
## USmelanoma$latitude -5.4950 0.5289 -10.390 1.55e-13 ***
## USmelanoma$longitude 0.1219 0.1732 0.704 0.485245
## USmelanoma$oceanyes 21.7976 5.2263 4.171 0.000137 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.48 on 45 degrees of freedom
## Multiple R-squared: 0.7721, Adjusted R-squared: 0.7569
## F-statistic: 50.83 on 3 and 45 DF, p-value: 1.7e-14
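#The latitude slope coefficient is negative (about -5.5), meaning that the higher the latitude, the lower the male melanoma mortality: roughly 5.5 fewer deaths per million inhabitants for each additional degree of latitude, holding longitude and ocean constant.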
# Residual sum of squares of the latitude-only model of mortality.
latitude_fit <- lm(USmelanoma$mortality ~ USmelanoma$latitude)
resmort <- sum(resid(latitude_fit)^2)
print(resmort)
## [1] 17173.07