Instructions

Exercises: 4 (Pg. 302); 1 (Pgs. 316-317); 1 (Pgs. 328-329); 1, 2 (Pgs. 353-354)

Submission: Submit via an electronic document on Sakai. Must be submitted as an HTML file generated in RStudio. All assigned problems are chosen according to the textbook R for Data Science. You do not need R code to answer every question. If you answer without using R code, delete the code chunk. If the question requires R code, make sure you display R code. If the question requires a figure, make sure you display a figure. A lot of the questions can be answered in written response, but require R code and/or figures for understanding and explaining.

Chapter 16 (Pg. 302)

Exercise 4

a)

test <- c(1, 3, 4, 7, 8, 10, 11, 15)
# Return the final element of `x`.
# An empty input yields an empty result, because indexing with 0 selects
# nothing.
lastVal <- function(x) {
  n <- length(x)
  x[n]
}
lastVal(test)
## [1] 15

b)

# Return the elements of `x` that sit at even positions (2nd, 4th, ...).
#
# Positions are picked with a logical mask rather than
# `seq(2, length(x), by = 2)`: that call raises an error whenever `x` has
# fewer than two elements, whereas the mask simply selects nothing.
evenInd <- function(x) {
  x[seq_along(x) %% 2 == 0]
}
evenInd(test)
## [1]  3  7 10 15

c)

# Return every element of `x` except the last one.
#
# `seq_len()` is used instead of `seq(1, length(x) - 1)` because the latter
# counts downwards for short inputs: with one element it yields c(1, 0) and
# the single element is returned instead of being dropped, and with an empty
# input it mixes positive and negative subscripts and errors.
# `max(..., 0L)` keeps the count non-negative for empty input.
exclLast <- function(x) {
  keep <- max(length(x) - 1L, 0L)
  x[seq_len(keep)]
}
exclLast(test)
## [1]  1  3  4  7  8 10 11

d)

# Return only the even values contained in `x`.
#
# `which()` keeps just the positions where the test is TRUE, so missing
# values are dropped; subsetting directly with the logical vector would
# turn every NA in `x` into an NA element of the result.
evenNum <- function(x) {
  x[which(x %% 2 == 0)]
}
evenNum(test)
## [1]  4  8 10

Chapter 17 (Pgs. 316-317)

Exercise 1

a)

# Mean of every column of mtcars, computed with an explicit for loop.
# One numeric slot per column is allocated up front so the vector is never
# grown inside the loop.
output <- numeric(ncol(mtcars))
for (i in seq_len(ncol(mtcars))) {
  output[i] <- mean(mtcars[[i]])
}
output
##  [1]  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250
##  [7]  17.848750   0.437500   0.406250   3.687500   2.812500

b)

library(nycflights13)
# Storage type of every column of flights, collected with a for loop.
# typeof() returns a string, so the result is preallocated as a character
# vector; a "double" vector would be coerced to character on the first
# assignment.
output <- vector("character", ncol(flights))
for (i in seq_along(flights)) {
  output[[i]] <- typeof(flights[[i]])
}
output
##  [1] "integer"   "integer"   "integer"   "integer"   "integer"   "double"   
##  [7] "integer"   "integer"   "double"    "character" "integer"   "character"
## [13] "character" "character" "double"    "double"    "double"    "double"   
## [19] "double"

c)

# Number of distinct values in every column of iris, collected with a
# for loop. The counts are integers (the map_int() version of this exercise
# relies on that), so the result is preallocated as an integer vector
# rather than a double one.
output <- vector("integer", ncol(iris))
for (i in seq_along(iris)) {
  output[[i]] <- n_distinct(iris[[i]])
}
output
## [1] 35 23 43 22  3

d)

# Draw `n` random normal values for each mean listed in `avg`.
# The draws for each mean are stored as one element of a preallocated list.
avg <- c(-10, 0, 10, 100)
n <- 10
output <- vector(mode = "list", length = length(avg))
for (i in seq_along(avg)) {
  output[[i]] <- rnorm(n = n, mean = avg[i])
}
output
## [[1]]
##  [1]  -9.407518 -10.634119  -9.898807 -10.232254  -9.655187  -9.135446
##  [7] -10.152907  -9.940554  -9.859148 -12.159881
## 
## [[2]]
##  [1] -2.178751270 -0.753577827 -0.131582717 -0.030505937  0.008757949
##  [6] -1.018524027 -1.585030423  0.613875827  1.571965992  1.211366752
## 
## [[3]]
##  [1]  9.773033 10.091090  9.295772 10.226379 10.422470 11.323002  9.528319
##  [8] 10.929249  7.950970 10.249809
## 
## [[4]]
##  [1]  98.25050 100.93517 100.97391 101.65958  99.82295  99.56524  99.28399
##  [8] 100.15540  98.56591  99.94817

Chapter 17 (Pgs. 328-329)

Exercise 1

a)

# Column means of mtcars, returned as a named double vector.
map_dbl(.x = mtcars, .f = mean)
##        mpg        cyl       disp         hp       drat         wt       qsec 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
##         vs         am       gear       carb 
##   0.437500   0.406250   3.687500   2.812500

b)

# Storage type of each flights column, returned as a named character vector.
map_chr(.x = flights, .f = typeof)
##           year          month            day       dep_time sched_dep_time 
##      "integer"      "integer"      "integer"      "integer"      "integer" 
##      dep_delay       arr_time sched_arr_time      arr_delay        carrier 
##       "double"      "integer"      "integer"       "double"    "character" 
##         flight        tailnum         origin           dest       air_time 
##      "integer"    "character"    "character"    "character"       "double" 
##       distance           hour         minute      time_hour 
##       "double"       "double"       "double"       "double"

c)

# Count of distinct values in each iris column, as a named integer vector.
map_int(.x = iris, .f = n_distinct)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           35           23           43           22            3

d)

# Ten random normal draws for each mean in `avg`; each element of `avg` is
# supplied to rnorm() as its `mean` argument.
avg <- c(-10, 0, 10, 100)
map(avg, function(mu) rnorm(n = 10, mean = mu))
## [[1]]
##  [1] -10.058524 -11.508636  -9.923234  -7.546378 -11.555733  -8.784430
##  [7]  -9.080017 -10.266738  -9.550265  -8.929867
## 
## [[2]]
##  [1] -0.7394196  1.4394611  0.9974425 -1.7314740 -0.1097655 -1.3902117
##  [7]  0.5072032 -1.3247428  0.6851473 -0.8509349
## 
## [[3]]
##  [1] 10.908294  8.199825 10.631607  9.042093  9.104534  8.094065  8.600622
##  [8] 10.475350  9.330712  9.874832
## 
## [[4]]
##  [1]  99.79590 100.47544 101.49889  98.61083 100.89766  99.94564  99.88350
##  [8]  98.51162  98.56632  98.66525

Chapter 18 (Pgs. 353-354)

Exercise 1

# Simulated data set: three observations at each x in 1..10, scattered
# around the line y = 1.5 * x + 6 with noise drawn from a t distribution
# with 2 degrees of freedom.
sim1a <- tibble(
  x = rep(1:10, each = 3),
  y = x * 1.5 + 6 + rt(length(x), df = 2)
)

# Fit a linear model of y on x to the simulated data.
sim1a_mod <- lm(y ~ x, data = sim1a)

# Scatter plot of the simulated data with ggplot2's default smoother on top.
ggplot(data = sim1a, mapping = aes(x = x, y = y)) +
  geom_point(size = 2, color = "red") +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Visualise the fitted linear model: raw data plus the least-squares line
# taken from the coefficients of sim1a_mod.
# Passing the lm object itself to ggplot() with geom_smooth() only re-plots
# the model's x/y data under a loess curve, i.e. it repeats the previous
# figure instead of showing the fitted line.
ggplot(sim1a, aes(x, y)) +
  geom_point(size = 2, color = "red") +
  geom_abline(
    intercept = coef(sim1a_mod)[[1]],
    slope = coef(sim1a_mod)[[2]]
  )

When I fit a linear model to the data, I noticed that the relationship looked more linear and stronger, and that there were fewer apparent outliers.

Exercise 2

# Predicted y values for a straight-line model.
#   mod:  vector whose first element is the intercept and second the slope.
#   data: object with an `x` column holding the predictor values.
make_prediction <- function(mod, data) {
  intercept <- mod[1]
  slope <- mod[2]
  intercept + data$x * slope
}
# Mean absolute distance between the observed `data$y` and the values
# predicted by the line described by `mod` (see make_prediction()).
measure_distance <- function(mod, data) {
  predicted <- make_prediction(mod, data)
  errors <- data$y - predicted
  mean(abs(errors))
}

# Search for the intercept and slope that minimise measure_distance() on
# sim1a, starting from (0, 0).
best <- optim(par = c(0, 0), fn = measure_distance, data = sim1a)

# Plot the data together with the line given by the optimised parameters.
ggplot(data = sim1a, mapping = aes(x = x, y = y)) +
  geom_point(size = 2, color = "grey30") +
  geom_abline(intercept = best$par[1], slope = best$par[2])

Fitting this model with optim() appears to give a better model than the linear model, because the fitted line follows the data points more closely. However, there is an obvious outlier that did not appear in the linear model.