Exercises: 4 (Pg. 302); 1 (Pgs. 316-317); 1 (Pgs. 328-329); 1, 2 (Pgs. 353-354)
Submission: Submit an electronic document via Sakai. It must be submitted as an HTML file generated in RStudio. All assigned problems come from the textbook R for Data Science. You do not need R code to answer every question; if you answer without R code, delete the empty code chunk. If a question requires R code, display the code, and if it requires a figure, display the figure. Many questions can be answered with a written response but still need R code and/or figures to support the explanation.
test <- c(1, 3, 4, 7, 8, 10, 11, 15)

# Return the last value of a vector.
lastVal <- function(x) {
  x[length(x)]
}
lastVal(test)
## [1] 15

# Return the elements at even-numbered positions.
evenInd <- function(x) {
  x[seq(2, length(x), by = 2)]
}
evenInd(test)
## [1] 3 7 10 15

# Return every element except the last.
exclLast <- function(x) {
  x[seq(1, length(x) - 1)]
}
exclLast(test)
## [1] 1 3 4 7 8 10 11

# Return only the elements that are even numbers.
evenNum <- function(x) {
  x[x %% 2 == 0]
}
evenNum(test)
## [1] 4 8 10
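As an optional quick check (not part of the required output), the helpers can be run on a minimal two-element vector; note that the seq()-based helpers assume the input has at least two elements.

# Optional sanity check on a two-element vector.
lastVal(c(2, 5))    # 5
evenInd(c(2, 5))    # element at position 2, i.e. 5
exclLast(c(2, 5))   # drops the last element, leaving 2
evenNum(c(2, 5))    # keeps only the even value, 2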
output <- vector("double", ncol(mtcars))
for (i in seq_along(mtcars)) {
output[[i]] <- mean(mtcars[[i]])
}
output
## [1] 20.090625 6.187500 230.721875 146.687500 3.596563 3.217250
## [7] 17.848750 0.437500 0.406250 3.687500 2.812500
library(tidyverse)     # loads dplyr, purrr, ggplot2, and tibble, used below
library(nycflights13)

# Determine the type of every column in flights with a for loop.
output <- vector("character", ncol(flights))
for (i in seq_along(flights)) {
  output[[i]] <- typeof(flights[[i]])
}
output
## [1] "integer" "integer" "integer" "integer" "integer" "double"
## [7] "integer" "integer" "double" "character" "integer" "character"
## [13] "character" "character" "double" "double" "double" "double"
## [19] "double"
output <- vector("double", ncol(iris))
for (i in seq_along(iris)) {
output[[i]] <- n_distinct(iris[[i]])
}
output
## [1] 35 23 43 22 3
# Generate 10 random normals for each mean in avg with a for loop.
avg <- c(-10, 0, 10, 100)
n <- 10
output <- vector("list", length(avg))
for (i in seq_along(avg)) {
  output[[i]] <- rnorm(n, mean = avg[[i]])
}
output
## [[1]]
## [1] -9.407518 -10.634119 -9.898807 -10.232254 -9.655187 -9.135446
## [7] -10.152907 -9.940554 -9.859148 -12.159881
##
## [[2]]
## [1] -2.178751270 -0.753577827 -0.131582717 -0.030505937 0.008757949
## [6] -1.018524027 -1.585030423 0.613875827 1.571965992 1.211366752
##
## [[3]]
## [1] 9.773033 10.091090 9.295772 10.226379 10.422470 11.323002 9.528319
## [8] 10.929249 7.950970 10.249809
##
## [[4]]
## [1] 98.25050 100.93517 100.97391 101.65958 99.82295 99.56524 99.28399
## [8] 100.15540 98.56591 99.94817
# The same column means, computed with purrr::map_dbl() instead of a for loop.
map_dbl(mtcars, mean)
## mpg cyl disp hp drat wt qsec
## 20.090625 6.187500 230.721875 146.687500 3.596563 3.217250 17.848750
## vs am gear carb
## 0.437500 0.406250 3.687500 2.812500
# Column types of flights, with map_chr().
map_chr(flights, typeof)
## year month day dep_time sched_dep_time
## "integer" "integer" "integer" "integer" "integer"
## dep_delay arr_time sched_arr_time arr_delay carrier
## "double" "integer" "integer" "double" "character"
## flight tailnum origin dest air_time
## "integer" "character" "character" "character" "double"
## distance hour minute time_hour
## "double" "double" "double" "double"
# Distinct values per column of iris, with map_int().
map_int(iris, n_distinct)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 35 23 43 22 3
# Random normals for each mean in avg, with map(); n = 10 is passed on to rnorm().
avg <- c(-10, 0, 10, 100)
map(avg, rnorm, n = 10)
## [[1]]
## [1] -10.058524 -11.508636 -9.923234 -7.546378 -11.555733 -8.784430
## [7] -9.080017 -10.266738 -9.550265 -8.929867
##
## [[2]]
## [1] -0.7394196 1.4394611 0.9974425 -1.7314740 -0.1097655 -1.3902117
## [7] 0.5072032 -1.3247428 0.6851473 -0.8509349
##
## [[3]]
## [1] 10.908294 8.199825 10.631607 9.042093 9.104534 8.094065 8.600622
## [8] 10.475350 9.330712 9.874832
##
## [[4]]
## [1] 99.79590 100.47544 101.49889 98.61083 100.89766 99.94564 99.88350
## [8] 98.51162 98.56632 98.66525
# Simulate data whose noise comes from a heavy-tailed t distribution (df = 2).
sim1a <- tibble(
  x = rep(1:10, each = 3),
  y = x * 1.5 + 6 + rt(length(x), df = 2)
)
sim1a_mod <- lm(y ~ x, data = sim1a)

# The simulated data with the default loess smooth...
ggplot(sim1a, aes(x, y)) + geom_point(size = 2, color = "red") + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# ...and the same data with a linear-model smooth for comparison.
ggplot(sim1a, aes(x, y)) + geom_point(size = 2, color = "red") + geom_smooth(method = "lm")
When I fit a linear model to the data, the fitted trend was straighter than the loess smooth and the relationship appeared stronger, and fewer points stood out as outliers relative to the fitted line.
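As a quick numeric check (a minimal sketch; the exact values depend on the random draw from rt()), the fitted coefficients can be compared with the true intercept of 6 and slope of 1.5 used in the simulation.

# Compare the fitted coefficients with the true simulation values.
coef(sim1a_mod)                  # fitted (Intercept) and x slope
c(intercept = 6, slope = 1.5)    # true values used to generate sim1a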
# Predict y from a parameter vector mod = c(intercept, slope).
make_prediction <- function(mod, data) {
  mod[1] + data$x * mod[2]
}

# Distance measure: mean absolute difference between observed and predicted y.
measure_distance <- function(mod, data) {
  diff <- data$y - make_prediction(mod, data)
  mean(abs(diff))
}

# Find the intercept and slope that minimize the mean absolute distance.
best <- optim(c(0, 0), measure_distance, data = sim1a)
ggplot(sim1a, aes(x, y)) +
  geom_point(size = 2, color = "grey30") +
  geom_abline(intercept = best$par[1], slope = best$par[2])
Fitting the model with optim() and the mean-absolute-distance measure produces a better fit than the least-squares linear model: the fitted line tracks the bulk of the data points more closely because it is less influenced by extreme values. However, one obvious outlier now stands far from the fitted line, which was not as apparent against the linear model's fit.
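To make the comparison concrete, both fitted lines can be overlaid on the same plot (a minimal sketch using the objects defined above; the picture varies with each simulated data set).

# Overlay the least-squares fit (red) and the mean-absolute-distance fit (blue).
ggplot(sim1a, aes(x, y)) +
  geom_point(size = 2, color = "grey30") +
  geom_abline(intercept = coef(sim1a_mod)[1], slope = coef(sim1a_mod)[2], color = "red") +
  geom_abline(intercept = best$par[1], slope = best$par[2], color = "blue")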