Problem Set # 4

Regina Mendicino

date()
## [1] "Thu Oct 27 23:08:10 2016"

Due Date: October 27, 2016

Total Points: 100

1 Use the reaction.time data set from UsingR package to test the hypothesis that older people have slower reaction times. Compute the conditional mean reaction times and make side-by-side box plots.

library(UsingR)
## Loading required package: MASS
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
## 
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
## 
##     cancer
# Print the complete reaction.time data frame (columns: age, gender, control, time).
reaction.time
##      age gender control     time
## 1  16-24      F       T 1.360075
## 2  16-24      M       T 1.467939
## 3  16-24      M       T 1.512036
## 4  16-24      F       T 1.390647
## 5  16-24      M       T 1.384208
## 6  16-24      M       C 1.393875
## 7  16-24      M       T 1.419043
## 8  16-24      F       T 1.461187
## 9  16-24      F       T 1.382461
## 10 16-24      M       C 1.260152
## 11 16-24      M       T 1.292917
## 12 16-24      F       T 1.331092
## 13 16-24      M       T 1.369171
## 14 16-24      F       C 1.330471
## 15 16-24      F       T 1.344506
## 16 16-24      M       C 1.357869
## 17 16-24      F       T 1.520244
## 18 16-24      M       T 1.352397
## 19 16-24      F       T 1.348420
## 20 16-24      F       T 1.373000
## 21   25+      M       C 1.306813
## 22   25+      F       C 1.384192
## 23   25+      M       T 1.345643
## 24   25+      M       C 1.594901
## 25   25+      M       T 1.341972
## 26   25+      F       T 1.472212
## 27   25+      F       T 1.440685
## 28   25+      M       T 1.502757
## 29   25+      F       T 1.501541
## 30   25+      F       T 1.562813
## 31   25+      F       C 1.475851
## 32   25+      F       T 1.475456
## 33   25+      M       C 1.312376
## 34   25+      F       T 1.447160
## 35   25+      M       T 1.463118
## 36   25+      M       C 1.293425
## 37   25+      F       C 1.402860
## 38   25+      M       C 1.245497
## 39   25+      F       T 1.475806
## 40   25+      M       T 1.520106
## 41   25+      F       T 1.437079
## 42   25+      F       C 1.583610
## 43   25+      F       T 1.499246
## 44   25+      M       C 1.436701
## 45   25+      F       T 1.524056
## 46   25+      F       T 1.510929
## 47   25+      F       C 1.522563
## 48   25+      M       C 1.462412
## 49   25+      F       C 1.313935
## 50   25+      F       T 1.440382
## 51   25+      M       T 1.444503
## 52   25+      F       C 1.316956
## 53   25+      M       T 1.517435
## 54   25+      M       T 1.536446
## 55   25+      M       T 1.444865
## 56   25+      M       C 1.442599
## 57   25+      M       C 1.355206
## 58   25+      F       T 1.614979
## 59   25+      M       T 1.527699
## 60   25+      M       T 1.466591
# Side-by-side box plots of reaction time, one box per age group.
ggplot(data = reaction.time, mapping = aes(x = age, y = time)) +
  geom_boxplot()

# Welch two-sample t-test comparing mean reaction time between the age groups.
# NOTE(review): this runs the default two-sided alternative, while the stated
# hypothesis (older people are slower) is directional; alternative = "less"
# would test mean(16-24) < mean(25+) -- confirm against the assignment's intent.
t.test(time ~ age, data = reaction.time)
## 
##  Welch Two Sample t-test
## 
## data:  time by age
## t = -3.2509, df = 49.028, p-value = 0.002083
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.10760560 -0.02539224
## sample estimates:
## mean in group 16-24   mean in group 25+ 
##            1.382586            1.449084

2 The mtcars dataset contains the miles per gallon and whether or not the transmission is automatic (0 = automatic, 1 = manual) for 32 automobiles.

# Print the complete mtcars data frame.
mtcars
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
  1. Plot a histogram of the miles per gallon over all cars. Use a bin width of 3 mpg.
# Histogram of miles per gallon across all cars.
# binwidth = 3 makes every bar span 3 mpg, as the exercise requires;
# bins = 3 would instead cut the whole mpg range into only three bars.
ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(binwidth = 3)

  1. Perform a Mann-Whitney (Wilcoxon) test under the hypothesis that there is no difference in mpg between automatic and manual transmission cars without assuming they follow a normal distribution. The alternative is there is a difference. What do you conclude?
# Wilcoxon rank-sum (Mann-Whitney) test of mpg between the two transmission
# groups coded in am; the alternative is two-sided by default.
wilcox.test(mtcars$mpg ~ mtcars$am)
## Warning in wilcox.test.default(x = c(21.4, 18.7, 18.1, 14.3, 24.4, 22.8, :
## cannot compute exact p-value with ties
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  mtcars$mpg by mtcars$am
## W = 42, p-value = 0.001871
## alternative hypothesis: true location shift is not equal to 0
#I conclude that there is a difference in mpg between automatic and manual transmissions: the very small p-value (0.001871) leads us to reject the null hypothesis of no difference.

3 The data set trees (datasets package, part of base install) contains the girth (inches), height (feet) and volume of timber from 31 felled black cherry trees. Suppose you want to predict tree volume from girth.

library(datasets)
# Print the complete trees data frame (columns: Girth, Height, Volume).
trees
##    Girth Height Volume
## 1    8.3     70   10.3
## 2    8.6     65   10.3
## 3    8.8     63   10.2
## 4   10.5     72   16.4
## 5   10.7     81   18.8
## 6   10.8     83   19.7
## 7   11.0     66   15.6
## 8   11.0     75   18.2
## 9   11.1     80   22.6
## 10  11.2     75   19.9
## 11  11.3     79   24.2
## 12  11.4     76   21.0
## 13  11.4     76   21.4
## 14  11.7     69   21.3
## 15  12.0     75   19.1
## 16  12.9     74   22.2
## 17  12.9     85   33.8
## 18  13.3     86   27.4
## 19  13.7     71   25.7
## 20  13.8     64   24.9
## 21  14.0     78   34.5
## 22  14.2     80   31.7
## 23  14.5     74   36.3
## 24  16.0     72   38.3
## 25  16.3     77   42.6
## 26  17.3     81   55.4
## 27  17.5     82   55.7
## 28  17.9     80   58.3
## 29  18.0     80   51.5
## 30  18.0     80   51.0
## 31  20.6     87   77.0
  1. Create a scatter plot of the data and label the axes.
# Scatter plot of timber volume against girth, with both axes labelled.
scattree <- ggplot(data = trees, mapping = aes(x = Girth, y = Volume)) +
  geom_point() +
  labs(x = "Tree Girth", y = "Tree Volume")
scattree

  1. Add a linear regression line to the plot.
# Overlay the least-squares line (without a confidence band) on the scatter plot.
scattree <- scattree +
  geom_smooth(method = "lm", se = FALSE, colour = "blue")
scattree

  1. Determine the sum of squared residuals.
# Draw each residual as a vertical segment from the fitted value on the
# regression line (y) to the observed volume (yend) at the same girth.
# The fitted values must be mapped to the `y` aesthetic: an aesthetic named
# `Volume` is not recognised by geom_segment(), so the segments would start
# and end at the observed value and have zero length.
scattree <- scattree +
  geom_segment(
    aes(
      x = Girth, xend = Girth,
      y = predict(lm(Volume ~ Girth, data = trees)), yend = Volume
    ),
    col = "blue"
  )
scattree

# Sum of squared residuals of the simple linear model Volume ~ Girth.
valtree <- sum(resid(lm(Volume ~ Girth, data = trees))^2)
valtree
## [1] 524.3025
  1. Repeat a, b, and c but use the square of the girth instead of girth as the explanatory variable. What model do you prefer and why?
# Scatter plot of timber volume against the square of girth.
# The x-axis label names the squared quantity that is actually plotted,
# so this plot is not mistaken for the plain-girth one.
scattree2 <- ggplot(trees, aes(x = Girth^2, y = Volume)) +
  geom_point() +
  xlab("Tree Girth Squared") +
  ylab("Tree Volume")
scattree2

# Squared girth, used as the explanatory variable of the second model.
Girth2 <- trees[["Girth"]]^2
Girth2
##  [1]  68.89  73.96  77.44 110.25 114.49 116.64 121.00 121.00 123.21 125.44
## [11] 127.69 129.96 129.96 136.89 144.00 166.41 166.41 176.89 187.69 190.44
## [21] 196.00 201.64 210.25 256.00 265.69 299.29 306.25 320.41 324.00 324.00
## [31] 424.36
# Overlay the least-squares line (without a confidence band) on the
# volume-versus-squared-girth scatter plot.
scattree2 <- scattree2 +
  geom_smooth(method = "lm", se = FALSE, colour = "blue")
scattree2

# Draw each residual of the squared-girth model as a vertical segment from
# the fitted value (y) to the observed volume (yend).
# The fitted values must be mapped to the `y` aesthetic: an aesthetic named
# `Volume` is not recognised by geom_segment(), so the segments would have
# zero length. Girth^2 is computed from the plot data itself so the layer
# does not depend on a separately defined global vector.
scattree2 <- scattree2 +
  geom_segment(
    aes(
      x = Girth^2, xend = Girth^2,
      y = predict(lm(Volume ~ I(Girth^2), data = trees)), yend = Volume
    ),
    col = "blue"
  )
scattree2

# Sum of squared residuals of the model that uses squared girth as the
# explanatory variable. The earlier version refit Volume ~ Girth here, which
# simply reproduced the first model's value.
# NOTE: the value printed beneath this statement (524.3025) came from that
# plain-girth fit and must be regenerated by re-running the document.
valtree2 <- sum(residuals(lm(Volume ~ I(Girth^2), data = trees))^2)
valtree2
## [1] 524.3025
#I prefer the model using the square of Girth. It seems to provide a more reliable linear regression and has a smaller sum of squared residuals.

4 The data set USmelanoma (HSAUR2 package) contains male mortality counts per one million inhabitants by state along with the latitude and longitude centroid of the state.

library(HSAUR2)
## Loading required package: tools
# Print the complete USmelanoma data frame (columns: mortality, latitude, longitude, ocean).
USmelanoma
##                      mortality latitude longitude ocean
## Alabama                    219     33.0      87.0   yes
## Arizona                    160     34.5     112.0    no
## Arkansas                   170     35.0      92.5    no
## California                 182     37.5     119.5   yes
## Colorado                   149     39.0     105.5    no
## Connecticut                159     41.8      72.8   yes
## Delaware                   200     39.0      75.5   yes
## District of Columbia       177     39.0      77.0    no
## Florida                    197     28.0      82.0   yes
## Georgia                    214     33.0      83.5   yes
## Idaho                      116     44.5     114.0    no
## Illinois                   124     40.0      89.5    no
## Indiana                    128     40.2      86.2    no
## Iowa                       128     42.2      93.8    no
## Kansas                     166     38.5      98.5    no
## Kentucky                   147     37.8      85.0    no
## Louisiana                  190     31.2      91.8   yes
## Maine                      117     45.2      69.0   yes
## Maryland                   162     39.0      76.5   yes
## Massachusetts              143     42.2      71.8   yes
## Michigan                   117     43.5      84.5    no
## Minnesota                  116     46.0      94.5    no
## Mississippi                207     32.8      90.0   yes
## Missouri                   131     38.5      92.0    no
## Montana                    109     47.0     110.5    no
## Nebraska                   122     41.5      99.5    no
## Nevada                     191     39.0     117.0    no
## New Hampshire              129     43.8      71.5   yes
## New Jersey                 159     40.2      74.5   yes
## New Mexico                 141     35.0     106.0    no
## New York                   152     43.0      75.5   yes
## North Carolina             199     35.5      79.5   yes
## North Dakota               115     47.5     100.5    no
## Ohio                       131     40.2      82.8    no
## Oklahoma                   182     35.5      97.2    no
## Oregon                     136     44.0     120.5   yes
## Pennsylvania               132     40.8      77.8    no
## Rhode Island               137     41.8      71.5   yes
## South Carolina             178     33.8      81.0   yes
## South Dakota                86     44.8     100.0    no
## Tennessee                  186     36.0      86.2    no
## Texas                      229     31.5      98.0   yes
## Utah                       142     39.5     111.5    no
## Vermont                    153     44.0      72.5   yes
## Virginia                   166     37.5      78.5   yes
## Washington                 117     47.5     121.0   yes
## West Virginia              136     38.8      80.8    no
## Wisconsin                  110     44.5      90.2    no
## Wyoming                    134     43.0     107.5    no
  1. Create a scatter plot of mortality versus latitude using latitude as the explanatory variable.
# Scatter plot of mortality against latitude (latitude is the explanatory variable).
mort <- ggplot(data = USmelanoma, mapping = aes(x = latitude, y = mortality)) +
  geom_point() +
  labs(x = "Latitude", y = "Mortality")
mort

  1. Add the linear regression line to your scatter plot.
# Overlay the least-squares line (without a confidence band) on the scatter plot.
mort <- mort +
  geom_smooth(method = "lm", se = FALSE, colour = "magenta")
mort

  1. Regress mortality on latitude and interpret the value of the slope coefficient.
# Fit a linear model of mortality on latitude, longitude and ocean, and print its summary.
# NOTE(review): the exercise asks to regress mortality on latitude only, and the
# sum of squared errors computed later uses the latitude-only model; consider
# fitting mortality ~ latitude here so both parts describe the same model.
summary(lm(USmelanoma$mortality ~ USmelanoma$latitude + USmelanoma$longitude + USmelanoma$ocean))
## 
## Call:
## lm(formula = USmelanoma$mortality ~ USmelanoma$latitude + USmelanoma$longitude + 
##     USmelanoma$ocean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -30.171 -10.070  -2.607   8.781  41.805 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          349.2369    27.0596  12.906  < 2e-16 ***
## USmelanoma$latitude   -5.4950     0.5289 -10.390 1.55e-13 ***
## USmelanoma$longitude   0.1219     0.1732   0.704 0.485245    
## USmelanoma$oceanyes   21.7976     5.2263   4.171 0.000137 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.48 on 45 degrees of freedom
## Multiple R-squared:  0.7721, Adjusted R-squared:  0.7569 
## F-statistic: 50.83 on 3 and 45 DF,  p-value: 1.7e-14
#The slope coefficient for latitude is negative (about -5.5): holding longitude and ocean proximity fixed, each additional degree of latitude is associated with roughly 5.5 fewer male melanoma deaths per million inhabitants.
  1. Determine the sum of squared errors.
# Sum of squared errors of the simple linear model mortality ~ latitude.
resmort <- sum(resid(lm(mortality ~ latitude, data = USmelanoma))^2)
resmort
## [1] 17173.07
  1. Examine the model assumptions. What do you conclude?