Problem Set 3

Annie Kelley

date()
## [1] "Wed Nov 30 17:17:49 2022"

output: html_document


(library(knitr))
## Warning: package 'knitr' was built under R version 4.2.2
## [1] "knitr"     "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [7] "methods"   "base"
(library(UsingR))
## Warning: package 'UsingR' was built under R version 4.2.2
## Loading required package: MASS
## Loading required package: HistData
## Warning: package 'HistData' was built under R version 4.2.2
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 4.2.2
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
## 
##     cancer
##  [1] "UsingR"    "Hmisc"     "ggplot2"   "Formula"   "survival"  "lattice"  
##  [7] "HistData"  "MASS"      "knitr"     "stats"     "graphics"  "grDevices"
## [13] "utils"     "datasets"  "methods"   "base"
(library(datasets))
##  [1] "UsingR"    "Hmisc"     "ggplot2"   "Formula"   "survival"  "lattice"  
##  [7] "HistData"  "MASS"      "knitr"     "stats"     "graphics"  "grDevices"
## [13] "utils"     "datasets"  "methods"   "base"
(library(HSAUR2))
## Warning: package 'HSAUR2' was built under R version 4.2.2
## Loading required package: tools
##  [1] "HSAUR2"    "tools"     "UsingR"    "Hmisc"     "ggplot2"   "Formula"  
##  [7] "survival"  "lattice"   "HistData"  "MASS"      "knitr"     "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"
(library(ggplot2))
##  [1] "HSAUR2"    "tools"     "UsingR"    "Hmisc"     "ggplot2"   "Formula"  
##  [7] "survival"  "lattice"   "HistData"  "MASS"      "knitr"     "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"
(library(tidyr))
## Warning: package 'tidyr' was built under R version 4.2.2
##  [1] "tidyr"     "HSAUR2"    "tools"     "UsingR"    "Hmisc"     "ggplot2"  
##  [7] "Formula"   "survival"  "lattice"   "HistData"  "MASS"      "knitr"    
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"
(library(dplyr))
## Warning: package 'dplyr' was built under R version 4.2.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
## 
##     src, summarize
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
##  [1] "dplyr"     "tidyr"     "HSAUR2"    "tools"     "UsingR"    "Hmisc"    
##  [7] "ggplot2"   "Formula"   "survival"  "lattice"   "HistData"  "MASS"     
## [13] "knitr"     "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [19] "methods"   "base"
(library(sm))
## Warning: package 'sm' was built under R version 4.2.2
## Package 'sm', version 2.2-5.7: type help(sm) for summary information
## 
## Attaching package: 'sm'
## The following object is masked from 'package:MASS':
## 
##     muscle
##  [1] "sm"        "dplyr"     "tidyr"     "HSAUR2"    "tools"     "UsingR"   
##  [7] "Hmisc"     "ggplot2"   "Formula"   "survival"  "lattice"   "HistData" 
## [13] "MASS"      "knitr"     "stats"     "graphics"  "grDevices" "utils"    
## [19] "datasets"  "methods"   "base"

1 Use the reaction.time data set from UsingR package to test the hypothesis that older people have slower reaction times. Compute the conditional mean reaction times and make side-by-side box plots.

rt <- as.data.frame(reaction.time, header = TRUE)
head(rt)
##     age gender control     time
## 1 16-24      F       T 1.360075
## 2 16-24      M       T 1.467939
## 3 16-24      M       T 1.512036
## 4 16-24      F       T 1.390647
## 5 16-24      M       T 1.384208
## 6 16-24      M       C 1.393875
old_rt <- as.data.frame(filter(rt, age == "25+"))

old_rt
##    age gender control     time
## 21 25+      M       C 1.306813
## 22 25+      F       C 1.384192
## 23 25+      M       T 1.345643
## 24 25+      M       C 1.594901
## 25 25+      M       T 1.341972
## 26 25+      F       T 1.472212
## 27 25+      F       T 1.440685
## 28 25+      M       T 1.502757
## 29 25+      F       T 1.501541
## 30 25+      F       T 1.562813
## 31 25+      F       C 1.475851
## 32 25+      F       T 1.475456
## 33 25+      M       C 1.312376
## 34 25+      F       T 1.447160
## 35 25+      M       T 1.463118
## 36 25+      M       C 1.293425
## 37 25+      F       C 1.402860
## 38 25+      M       C 1.245497
## 39 25+      F       T 1.475806
## 40 25+      M       T 1.520106
## 41 25+      F       T 1.437079
## 42 25+      F       C 1.583610
## 43 25+      F       T 1.499246
## 44 25+      M       C 1.436701
## 45 25+      F       T 1.524056
## 46 25+      F       T 1.510929
## 47 25+      F       C 1.522563
## 48 25+      M       C 1.462412
## 49 25+      F       C 1.313935
## 50 25+      F       T 1.440382
## 51 25+      M       T 1.444503
## 52 25+      F       C 1.316956
## 53 25+      M       T 1.517435
## 54 25+      M       T 1.536446
## 55 25+      M       T 1.444865
## 56 25+      M       C 1.442599
## 57 25+      M       C 1.355206
## 58 25+      F       T 1.614979
## 59 25+      M       T 1.527699
## 60 25+      M       T 1.466591
old_mean <- mean(old_rt$time)

old_mean
## [1] 1.449084
ggplot(rt, aes(y = time, x = age)) +
         geom_boxplot() +
         ylab("Reaction Time (seconds)") +
         xlab("Age")

2 The mtcars dataset contains the miles per gallon and whether or not the transmission is automatic (0 = automatic, 1 = manual) for 32 automobiles.

cars <- as.data.frame(mtcars, header = TRUE)
head(cars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
  1. Plot a histogram of the miles per gallon over all cars. Use a bin width of 3 mpg.
cars_p <- ggplot(cars, aes(mpg)) +
  geom_histogram(bins = 3) +
  xlab("MPG") +
  ylab("# of Car Models")

cars_p

  1. Perform a Mann-Whitney (Wilcoxon) test under the hypothesis that there is no difference in mpg between automatic and manual transmission cars without assuming they follow a normal distribution. The alternative is there is a difference. What do you conclude?
wilcox.test(cars$mpg ~ cars$am)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  cars$mpg by cars$am
## W = 42, p-value = 0.001871
## alternative hypothesis: true location shift is not equal to 0

3 The data set trees (datasets package, part of base install) contains the girth (inches), height (feet) and volume of timber from 31 felled black cherry trees. Suppose you want to predict tree volume from girth.

head(trees)
##   Girth Height Volume
## 1   8.3     70   10.3
## 2   8.6     65   10.3
## 3   8.8     63   10.2
## 4  10.5     72   16.4
## 5  10.7     81   18.8
## 6  10.8     83   19.7
  1. Create a scatter plot of the data and label the axes.
tp <-
  trees %>%
  ggplot(aes(x = Girth, y = Volume)) +
  geom_point(color = "green3") +
  xlab("Tree Girth (in)") +
  ylab("Tree Volums (ft^3)")
tp

  1. Add a linear regression line to the plot.
tp2 <- tp + geom_smooth(method = lm, se = FALSE, color = "brown")
tp2
## `geom_smooth()` using formula = 'y ~ x'

  1. Determine the sum of squared residuals.
model <- lm(Volume ~ Girth, data = trees)

SSE <- sum(residuals(model)^2)
SSE
## [1] 524.3025
  1. Repeat a, b, and c but use the square of the girth instead of girth as the explanatory variable. Which model do you prefer and why?
trees2 <-
  trees %>%
  mutate(Girth2 = Girth ^ 2)

head(trees2)
##   Girth Height Volume Girth2
## 1   8.3     70   10.3  68.89
## 2   8.6     65   10.3  73.96
## 3   8.8     63   10.2  77.44
## 4  10.5     72   16.4 110.25
## 5  10.7     81   18.8 114.49
## 6  10.8     83   19.7 116.64
tr <- 
  trees2 %>%
  ggplot(aes(x = Girth2, y = Volume)) +
  geom_point(color = "green3") +
  xlab("Girth Squared (in)") +
  ylab("Tree Volume (ft^3)")
tr

tr2 <- tr + geom_smooth(method = lm, se = FALSE, color = "brown")
tr2
## `geom_smooth()` using formula = 'y ~ x'

model_tr <- lm(Volume ~ Girth2, data = trees2)

tr_SSE <- sum(residuals(model_tr)^2)
tr_SSE
## [1] 329.3191
### Since the SSE value is lower in the second model employing Girth 2, I favor it. As a result, the second model fits the data better since there are closer gaps between the observations and regression line.

4 The data set USmelanoma (HSAUR2 package) contains male mortality counts per one million inhabitants by state along with the latitude and longitude centroid of the state.

mel <- USmelanoma
head(mel)
##             mortality latitude longitude ocean
## Alabama           219     33.0      87.0   yes
## Arizona           160     34.5     112.0    no
## Arkansas          170     35.0      92.5    no
## California        182     37.5     119.5   yes
## Colorado          149     39.0     105.5    no
## Connecticut       159     41.8      72.8   yes
  1. Create a scatter plot of mortality versus latitude using latitude as the explanatory variable.
mm <-
  mel %>%
  ggplot(aes(x = latitude, y = mortality)) +
  geom_point(size = 1.5, color = "red4") +
  xlab("Latitude") +
  ylab("Male Mortality (per 1 million inhabitants)")
mm

  1. Add the linear regression line to your scatter plot.
mm2 <- mm + geom_smooth(method = lm, se = FALSE, size = 1, color = 'black')
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
mm2
## `geom_smooth()` using formula = 'y ~ x'

  1. Regress mortality on latitude and interpret the value of the slope coefficient.
mm_model <- lm(mortality ~ latitude, data = mel)
summary(mm_model)
## 
## Call:
## lm(formula = mortality ~ latitude, data = mel)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.972 -13.185   0.972  12.006  43.938 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 389.1894    23.8123   16.34  < 2e-16 ***
## latitude     -5.9776     0.5984   -9.99 3.31e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.12 on 47 degrees of freedom
## Multiple R-squared:  0.6798, Adjusted R-squared:  0.673 
## F-statistic:  99.8 on 1 and 47 DF,  p-value: 3.309e-13
  1. Determine the sum of squared errors.
mm_SSE <- sum(residuals(mm_model)^2)
mm_SSE
## [1] 17173.07
  1. Examine the model assumptions. What do you conclude?
ggplot(mel, aes(x = cut(latitude, breaks = 5), y = mortality)) + 
  geom_boxplot() +
  xlab("Latitude") + ylab("Mortality")

res <- resid(mm_model)

sm.density(res, model = "Normal")