library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(googlesheets4)
library(dplyr)
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
Go through the Program chapters 17 to 21 in R for Data Science by Garrett Grolemund, Hadley Wickham and do all the exercises that include programming. In addition, use the mtcars data set to perform hierarchical clustering on the cars with euclidean distance and Pearson correlation (use: dist.pear <- function(x) as.dist(1-cor(t(x))) as distance function). Next perform PCA and plot the scree plot. How many dimensions are reasonable in this dataset?
19.2.1 Exercises
foo_foo <- data.frame(name = "Foo Foo")
hop_through <- function(x, forest) {
x$location <- paste(x$name, "hopped through the", forest)
return(x)
}
scoop_up <- function(x, field_mouse) {
x$action <- paste(x$name, "scooped up the", field_mouse)
return(x)
}
bop_on_head <- function(x, field_mouse) {
x$action <- paste(x$action, "and bopped them on the head.")
return(x)
}
down_came <- function(x, fairy) {
x$arrival <- paste(fairy, "came down")
return(x)
}
said <- function(x, message) {
x$message <- paste(x$arrival, "and said", paste(message, collapse = " "))
return(x)
}
give_chances <- function(from, to, number, condition, consequence) {
message <- paste(from, "gave", to$name, number, "chances to change their ways.")
message <- paste(message, "But", to$name, "did not listen and", condition, ".")
message <- paste(message, "So", from, "turned", to$name, "into a", consequence, ".")
return(message)
}
threat <- function(chances) {
message <- give_chances(
from = "Good Fairy",
to = foo_foo,
number = chances,
condition = "didn't behave",
consequence = "goon"
)
return(message)
}
lyric <- function() {
foo_foo %>%
hop_through(forest = "forest") %>%
scoop_up(field_mouse = "field mice") %>%
bop_on_head(field_mouse = "field mice")
down_came(foo_foo, "Good Fairy")
said(foo_foo, c(
"Little bunny Foo Foo",
"I don't want to see you",
"Scooping up the field mice",
"And bopping them on the head."
))
return(foo_foo)
}
lyric()
## name
## 1 Foo Foo
threat(3)
## [1] "Good Fairy gave Foo Foo 3 chances to change their ways. But Foo Foo did not listen and didn't behave . So Good Fairy turned Foo Foo into a goon ."
lyric()
## name
## 1 Foo Foo
threat(2)
## [1] "Good Fairy gave Foo Foo 2 chances to change their ways. But Foo Foo did not listen and didn't behave . So Good Fairy turned Foo Foo into a goon ."
lyric()
## name
## 1 Foo Foo
threat(1)
## [1] "Good Fairy gave Foo Foo 1 chances to change their ways. But Foo Foo did not listen and didn't behave . So Good Fairy turned Foo Foo into a goon ."
lyric()
## name
## 1 Foo Foo
19.4.4 Exercises
Greetings <- function(time_of_the_day = lubridate::now()) {
Current_hour <- lubridate::hour(time_of_the_day)
if (Current_hour < 12) {
"Good Morning"
} else if (Current_hour < 18) {
"Good Afternoon"
} else {
"Good Night"
}
}
fizzbuzz<- function(k){
if (k%%3 == 0 && k%%5==0){
"fizzbuzz"
}
else if (k%%5 == 0){
"buzz"
}
else if (k%%3 ==0){
"fizz"
}
else {
k
}
}
fizzbuzz(21)
## [1] "fizz"
fizzbuzz(20)
## [1] "buzz"
fizzbuzz(15)
## [1] "fizzbuzz"
fizzbuzz(2)
## [1] 2
TemperatureCheck <- function(temp)
{
if (temp <= 0){"freezing"
} else if (temp <= 10){
"cold"
} else if (temp <= 20){
"cool"
} else if (temp <= 30){
"warm"
} else {
"hot"
}
}
TemperatureCheck <- function(temp)
cut(temp, c(-Inf, 0, 10, 20, 30, Inf), labels = c("freezing", "cold", "cool", "warm", "hot"))
How would you change the call to cut() if I’d used < instead of <=? What is the other chief advantage of cut() for this problem? (Hint: what happens if you have many values in temp?)
The switch statement can be used with numeric values, where it evaluates an expression and compares its value against each case label within the switch block. If the expression matches a case label, the code associated with that label will be executed. In case there is no match, the code associated with the default label (if provided) will be executed. If neither the expression matches any case label nor a default label is present, the switch statement will have no effect.
x= "a"
switch(x,
a = ,
b = "ab",
c =,
d = "cd"
)
## [1] "ab"
#If the slot is empty, the switch() call moves to the next slot that contains values
19.5.5 Exercises 1. What does commas(letters, collapse = “-”) do? Why?
library("stringr")
commas<- function(letters, collapse = "-"){
str_c(letters, collapse = "-")
}
commas(letters, collapse = "-")
## [1] "a-b-c-d-e-f-g-h-i-j-k-l-m-n-o-p-q-r-s-t-u-v-w-x-y-z"
#Running the code separates letters by `-`.
20.4.6 Exercises
TheLastVaue <- function(x) {
x[length(x)]
}
TheLastVaue
## function(x) {
## x[length(x)]
## }
Even_numbered <- function(x) {
x[seq(from = 2, to = length(x), by = 2)]
}
Even_numbered
## function(x) {
## x[seq(from = 2, to = length(x), by = 2)]
## }
LessLastVakue <- function(x) {
head(x, -1)
}
EvenNumbers <- function(x) {
x[which(is.even(x) & !is.na(x))]
}
x[-which(x > 0)] is not the same as x[x <= 0] because the first expression selects all elements of x except those that are greater than 0, whereas the second expression selects only the elements of x that are less than or equal to 0.
If you subset a vector with a positive integer that’s bigger than the length of the vector, R returns NA. If you subset with a name that doesn’t exist, R returns an error message indicating that the object doesn’t exist.
21.2.1 Exercises 1. Write for loops to: 1. Compute the mean of every column in mtcars.
columnMean <- colMeans(mtcars)
Types_of_columns <- sapply(nycflights13::flights, class)
UniqueCounts <- lapply(iris, function(x) length(unique(x)))
The_Mean <- c(-10, 0, 10, 100)
for (mean in The_Mean) {
RandomNormals <- rnorm(10, mean = mean)
print(RandomNormals)
}
## [1] -9.925340 -10.469569 -9.534925 -10.504695 -10.750347 -9.189290
## [7] -12.326101 -10.327889 -10.133558 -10.834610
## [1] 0.4761749 -1.6024989 -1.3324893 0.4350213 -1.2141491 -0.3773113
## [7] -0.7543699 -0.9888819 -1.7792520 -1.4190431
## [1] 12.563215 10.303310 10.422997 8.185336 10.593867 9.191544 9.632204
## [8] 8.020937 9.282840 10.664896
## [1] 98.65521 101.46032 100.45857 101.17687 99.79501 99.47962 100.14464
## [8] 99.07124 101.00859 100.44180
out <- ""
for (x in letters) {
out <- stringr::str_c(out, x)
}
#To concatenate all the elements of letters into a single string, use paste with the collapse argument:
out <- paste(letters, collapse = "")
x <- sample(100)
sd <- 0
for (i in seq_along(x)) {
sd <- sd + (x[i] - mean(x)) ^ 2
}
sd <- sqrt(sd / (length(x) - 1))
#To calculate the standard deviation of x, use sd:
sd <- sd(x)
x <- runif(100)
out <- vector("numeric", length(x))
out[1] <- x[1]
for (i in 2:length(x)) {
out[i] <- out[i - 1] + x[i]
}
#To cumulatively sum the elements of x, use cumsum:
out <- cumsum(x)
for (i in 5:2) {
cat(paste0(i, " humps on Alice the Camel\n"))
cat("She went through the desert with ease\n")
}
## 5 humps on Alice the Camel
## She went through the desert with ease
## 4 humps on Alice the Camel
## She went through the desert with ease
## 3 humps on Alice the Camel
## She went through the desert with ease
## 2 humps on Alice the Camel
## She went through the desert with ease
cat("Two humps on Alice the Camel\nShe's a pretty good camel!\n")
## Two humps on Alice the Camel
## She's a pretty good camel!
In addition, use the mtcars data set to perform hierarchical clustering on the cars with euclidean distance and Pearson correlation (use: dist.pear <- function(x) as.dist(1-cor(t(x))) as distance function). Next perform PCA and plot the scree plot.
library(dplyr)
# Calculate Euclidean distance
dist.euclid <- dist(mtcars)
# Calculate Pearson correlation distance
dist.pear <- function(x) as.dist(1-cor(t(x)))
# Perform hierarchical clustering using Euclidean distance
hc.euclid <- hclust(dist.euclid, method="ward.D2")
# Perform hierarchical clustering using Pearson correlation distance
hc.pear <- hclust(dist.pear(mtcars), method="ward.D2")
# Perform PCA on the mtcars data
pca.res <- prcomp(mtcars, scale=TRUE)
# Plot the scree plot
plot(pca.res$sdev, type="b", xlab="Principal Component", ylab="Standard Deviation", main="Scree Plot")
abline(h=0, col="gray", lty=2)
How many dimensions are reasonable in this dataset? From the Scree plot generated above, the appropriate number of dimensions in this dataset is 4. My conclusion is based on the observation that at PC = 4, the plot shows the point at which the decline in the standard deviation levels off (represented by the “elbow” in the plot), hence, the appropriate number of dimensions to keep.