Homework 4

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(ggplot2)
library(lubridate)

## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(googlesheets4)
library(dplyr)
library(magrittr)

## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

Go through the Program chapters 17 to 21 in R for Data Science by Garrett Grolemund, Hadley Wickham and do all the exercises that include programming. In addition, use the mtcars data set to perform hierarchical clustering on the cars with euclidean distance and Pearson correlation (use: dist.pear <- function(x) as.dist(1-cor(t(x))) as distance function). Next perform PCA and plot the scree plot. How many dimensions are reasonable in this dataset?

19.2.1 Exercises

Read the complete lyrics to “Little Bunny Foo Foo”. There’s a lot of duplication in this song. Extend the initial piping example to recreate the complete song, and use functions to reduce the duplication.

foo_foo <- data.frame(name = "Foo Foo")

hop_through <- function(x, forest) {
  x$location <- paste(x$name, "hopped through the", forest)
  return(x)
}

scoop_up <- function(x, field_mouse) {
  x$action <- paste(x$name, "scooped up the", field_mouse)
  return(x)
}

bop_on_head <- function(x, field_mouse) {
  x$action <- paste(x$action, "and bopped them on the head.")
  return(x)
}

down_came <- function(x, fairy) {
  x$arrival <- paste(fairy, "came down")
  return(x)
}

said <- function(x, message) {
  x$message <- paste(x$arrival, "and said", paste(message, collapse = " "))
  return(x)
}

give_chances <- function(from, to, number, condition, consequence) {
  message <- paste(from, "gave", to$name, number, "chances to change their ways.")
  message <- paste(message, "But", to$name, "did not listen and", condition, ".")
  message <- paste(message, "So", from, "turned", to$name, "into a", consequence, ".")
  return(message)
}

threat <- function(chances) {
  message <- give_chances(
    from = "Good Fairy",
    to = foo_foo,
    number = chances,
    condition = "didn't behave",
    consequence = "goon"
  )
  return(message)
}

lyric <- function() {
  foo_foo %>%
    hop_through(forest = "forest") %>%
    scoop_up(field_mouse = "field mice") %>%
    bop_on_head(field_mouse = "field mice")
  
  down_came(foo_foo, "Good Fairy")
  said(foo_foo, c(
    "Little bunny Foo Foo",
    "I don't want to see you",
    "Scooping up the field mice",
    "And bopping them on the head."
  ))
  
  return(foo_foo)
}

lyric()

##      name
## 1 Foo Foo

threat(3)

## [1] "Good Fairy gave Foo Foo 3 chances to change their ways. But Foo Foo did not listen and didn't behave . So Good Fairy turned Foo Foo into a goon ."

lyric()

##      name
## 1 Foo Foo

threat(2)

## [1] "Good Fairy gave Foo Foo 2 chances to change their ways. But Foo Foo did not listen and didn't behave . So Good Fairy turned Foo Foo into a goon ."

lyric()

##      name
## 1 Foo Foo

threat(1)

## [1] "Good Fairy gave Foo Foo 1 chances to change their ways. But Foo Foo did not listen and didn't behave . So Good Fairy turned Foo Foo into a goon ."

lyric()

##      name
## 1 Foo Foo

19.4.4 Exercises

Write a greeting function that says “good morning”, “good afternoon”, or “good evening”, depending on the time of day. (Hint: use a time argument that defaults to lubridate::now(). That will make it easier to test your function.)

Greetings <- function(time_of_the_day = lubridate::now()) {
  Current_hour <- lubridate::hour(time_of_the_day)
  
  if (Current_hour < 12) {
    "Good Morning"
  } else if (Current_hour < 18) {
    "Good Afternoon"
  } else {
    "Good Night"
  }
}

Implement a fizzbuzz function. It takes a single number as input. If the number is divisible by three, it returns “fizz”. If it’s divisible by five it returns “buzz”. If it’s divisible by three and five, it returns “fizzbuzz”. Otherwise, it returns the number. Make sure you first write working code before you create the function.

fizzbuzz<- function(k){
  
  if (k%%3 == 0 && k%%5==0){
    "fizzbuzz"
  } 
  else if (k%%5 == 0){
    "buzz"
  }
  else if (k%%3 ==0){
    "fizz"
  }
  else {
    k
  }
}
fizzbuzz(21)

## [1] "fizz"

fizzbuzz(20)

## [1] "buzz"

fizzbuzz(15)

## [1] "fizzbuzz"

fizzbuzz(2)

## [1] 2

How could you use cut() to simplify this set of nested if-else statements?

TemperatureCheck <- function(temp) 
{
      if (temp <= 0){"freezing"
      } else if (temp <= 10){
        "cold"
      }  else if (temp <= 20){
        "cool"
      }  else if (temp <= 30){
        "warm"
      }  else {
        "hot"
      }
}

TemperatureCheck <- function(temp)
cut(temp, c(-Inf, 0, 10, 20, 30, Inf), labels = c("freezing", "cold", "cool", "warm", "hot"))

How would you change the call to cut() if I’d used < instead of <=? What is the other chief advantage of cut() for this problem? (Hint: what happens if you have many values in temp?)

What happens if you use switch() with numeric values?

The switch statement can be used with numeric values, where it evaluates an expression and compares its value against each case label within the switch block. If the expression matches a case label, the code associated with that label will be executed. In case there is no match, the code associated with the default label (if provided) will be executed. If neither the expression matches any case label nor a default label is present, the switch statement will have no effect.

What does this switch() call do? What happens if x is “e”?

x= "a"
switch(x, 
  a = ,
  b = "ab",
  c =,
  d = "cd"
)

## [1] "ab"

#If the slot is empty, the switch() call moves to the next slot that contains values

19.5.5 Exercises 1. What does commas(letters, collapse = “-”) do? Why?

library("stringr")

commas<- function(letters, collapse = "-"){
  str_c(letters, collapse = "-")
}
commas(letters, collapse = "-")

## [1] "a-b-c-d-e-f-g-h-i-j-k-l-m-n-o-p-q-r-s-t-u-v-w-x-y-z"

#Running the code separates letters by `-`.

20.4.6 Exercises

Create functions that take a vector as input and returns:
The last value. Should you use [ or [[?

TheLastVaue <- function(x) {
  x[length(x)]
}
TheLastVaue

## function(x) {
##   x[length(x)]
## }

The elements at even numbered positions.

Even_numbered <- function(x) {
  x[seq(from = 2, to = length(x), by = 2)]
}
Even_numbered

## function(x) {
##   x[seq(from = 2, to = length(x), by = 2)]
## }

Every element except the last value.

LessLastVakue <- function(x) {
  head(x, -1)
}

Only even numbers (and no missing values).

EvenNumbers <- function(x) {
  x[which(is.even(x) & !is.na(x))]
}

Why is x[-which(x > 0)] not the same as x[x <= 0]?

x[-which(x > 0)] is not the same as x[x <= 0] because the first expression selects all elements of x except those that are greater than 0, whereas the second expression selects only the elements of x that are less than or equal to 0.

What happens when you subset with a positive integer that’s bigger than the length of the vector? What happens when you subset with a name that doesn’t exist?

If you subset a vector with a positive integer that’s bigger than the length of the vector, R returns NA. If you subset with a name that doesn’t exist, R returns an error message indicating that the object doesn’t exist.

21.2.1 Exercises 1. Write for loops to: 1. Compute the mean of every column in mtcars.

columnMean <- colMeans(mtcars)

Determine the type of each column in nycflights13::flights.

Types_of_columns <- sapply(nycflights13::flights, class)

Compute the number of unique values in each column of iris.

UniqueCounts <- lapply(iris, function(x) length(unique(x)))

Generate 10 random normals from distributions with means of -10, 0, 10, and 100.

The_Mean <- c(-10, 0, 10, 100)

for (mean in The_Mean) {
  RandomNormals <- rnorm(10, mean = mean)
  print(RandomNormals)
}

##  [1]  -9.925340 -10.469569  -9.534925 -10.504695 -10.750347  -9.189290
##  [7] -12.326101 -10.327889 -10.133558 -10.834610
##  [1]  0.4761749 -1.6024989 -1.3324893  0.4350213 -1.2141491 -0.3773113
##  [7] -0.7543699 -0.9888819 -1.7792520 -1.4190431
##  [1] 12.563215 10.303310 10.422997  8.185336 10.593867  9.191544  9.632204
##  [8]  8.020937  9.282840 10.664896
##  [1]  98.65521 101.46032 100.45857 101.17687  99.79501  99.47962 100.14464
##  [8]  99.07124 101.00859 100.44180

Eliminate the for loop in each of the following examples by taking advantage of an existing function that works with vectors:

out <- ""
for (x in letters) {
  out <- stringr::str_c(out, x)
}

#To concatenate all the elements of letters into a single string, use paste with the collapse argument:
out <- paste(letters, collapse = "")


x <- sample(100)
sd <- 0
for (i in seq_along(x)) {
  sd <- sd + (x[i] - mean(x)) ^ 2
}
sd <- sqrt(sd / (length(x) - 1))

#To calculate the standard deviation of x, use sd:
sd <- sd(x)

x <- runif(100)
out <- vector("numeric", length(x))
out[1] <- x[1]
for (i in 2:length(x)) {
  out[i] <- out[i - 1] + x[i]
}

#To cumulatively sum the elements of x, use cumsum:
out <- cumsum(x)

Combine your function writing and for loop skills:
Write a for loop that prints() the lyrics to the children’s song “Alice the camel”.

for (i in 5:2) {
  cat(paste0(i, " humps on Alice the Camel\n"))
  cat("She went through the desert with ease\n")
}

## 5 humps on Alice the Camel
## She went through the desert with ease
## 4 humps on Alice the Camel
## She went through the desert with ease
## 3 humps on Alice the Camel
## She went through the desert with ease
## 2 humps on Alice the Camel
## She went through the desert with ease

cat("Two humps on Alice the Camel\nShe's a pretty good camel!\n")

## Two humps on Alice the Camel
## She's a pretty good camel!

In addition, use the mtcars data set to perform hierarchical clustering on the cars with euclidean distance and Pearson correlation (use: dist.pear <- function(x) as.dist(1-cor(t(x))) as distance function). Next perform PCA and plot the scree plot.

library(dplyr)

# Calculate Euclidean distance
dist.euclid <- dist(mtcars)

# Calculate Pearson correlation distance
dist.pear <- function(x) as.dist(1-cor(t(x)))

# Perform hierarchical clustering using Euclidean distance
hc.euclid <- hclust(dist.euclid, method="ward.D2")

# Perform hierarchical clustering using Pearson correlation distance
hc.pear <- hclust(dist.pear(mtcars), method="ward.D2")

# Perform PCA on the mtcars data
pca.res <- prcomp(mtcars, scale=TRUE)

# Plot the scree plot
plot(pca.res$sdev, type="b", xlab="Principal Component", ylab="Standard Deviation", main="Scree Plot")
abline(h=0, col="gray", lty=2)

How many dimensions are reasonable in this dataset? From the Scree plot generated above, the appropriate number of dimensions in this dataset is 4. My conclusion is based on the observation that at PC = 4, the plot shows the point at which the decline in the standard deviation levels off (represented by the “elbow” in the plot), hence, the appropriate number of dimensions to keep.

Homework 4

Banabas Kariuki

2023-02-15