#Lecture 3 Questions

Is the above plot informative? What will you do to make it more informative?

library(RCurl)
## Loading required package: bitops
# Assemble the raw-GitHub address of the NYC HANES diabetes CSV from the
# repository root and the path of the file inside the repository.
URL_text_1 <- "https://raw.githubusercontent.com/kannan-kasthuri/kannan-kasthuri.github.io"
URL_text_2 <- "/master/Datasets/HANES/NYC_HANES_DIAB.csv"
URL <- paste0(URL_text_1, URL_text_2)

# Fetch the file as text with RCurl::getURL() and parse it as CSV.
HANES <- read.csv(text = getURL(URL))

# Convert the GENDER, AGEGROUP and HSQ_1 columns to factors so that they are
# treated as categorical variables by the plotting code further down.
HANES <- transform(
  HANES,
  GENDER = as.factor(GENDER),
  AGEGROUP = as.factor(AGEGROUP),
  HSQ_1 = as.factor(HSQ_1)
)

# This plot is not informative in any way: the axes are not labeled and there
# is no title, so given only this plot you would have no idea what data it
# is showing.
plot(HANES$GENDER)

# First replace the gender codes with the labels "M" and "F" (the labels are
# assigned to the factor levels in their existing order), then add a title
# and a y-axis label that say what the bar heights mean.
HANES$GENDER <- factor(HANES$GENDER, labels = c("M", "F"))
plot(
  HANES$GENDER,
  ylab = "number of people",
  main = "Gender Distribution"
)

Find the distribution of A1C for the female population in the above data set. Is it different from the male distribution? Add vertical lines that indicate the boundaries of the standard error of the mean. Find the distribution of A1C for three age groups in the above data set. Is there a difference? Try to find the distribution of one more numeric variable (other than A1C) for the three age groups. Try some plots with a higher number of bins in the above exercise; what happens? (Smaller intervals?)

# The distribution for females is similar to that of males; both are
# positively skewed.
HANES_FEMALE <- HANES$GENDER == "F"
FEMALES_DF <- HANES[HANES_FEMALE, ]
hist(FEMALES_DF$A1C)

# Standard error of the mean of the females' A1C, computed by hand as
# sd / sqrt(n), where n counts only the non-missing A1C values.
a1c_observed <- sum(!is.na(FEMALES_DF$A1C))
se <- sd(FEMALES_DF$A1C, na.rm = TRUE) / sqrt(a1c_observed)

# Overlay the mean (red) and the mean plus/minus one standard error (blue)
# on the histogram drawn above. The mean is computed once and reused.
a1c_mean <- mean(FEMALES_DF$A1C, na.rm = TRUE)
abline(v = a1c_mean, col = "red")
abline(v = a1c_mean + se, col = "blue")
abline(v = a1c_mean - se, col = "blue")

# Split the female subset by age group: one logical mask per AGEGROUP code,
# then one data frame per mask.
FEMALES_AGE1 <- FEMALES_DF$AGEGROUP == "1"
FEMALES_AGE2 <- FEMALES_DF$AGEGROUP == "2"
FEMALES_AGE3 <- FEMALES_DF$AGEGROUP == "3"

FEMALESAGE1_DF <- FEMALES_DF[FEMALES_AGE1, ]
FEMALESAGE2_DF <- FEMALES_DF[FEMALES_AGE2, ]
FEMALESAGE3_DF <- FEMALES_DF[FEMALES_AGE3, ]

# Histograms of the females' A1C values for the three age groups; the
# distribution appears to shift right with each successive age group.
hist(FEMALESAGE1_DF$A1C)
hist(FEMALESAGE2_DF$A1C)
hist(FEMALESAGE3_DF$A1C)

# The same comparison for a second numeric variable, HDL.
hist(FEMALESAGE1_DF$HDL)
hist(FEMALESAGE2_DF$HDL)
hist(FEMALESAGE3_DF$HDL)

# The females' A1C histogram again, this time asking for more bins (breaks).
hist(FEMALES_DF$A1C, breaks = 25)

Check the Hmisc::label() function. With reference to the graph above, think about how one can leverage this function to save some typing when plotting several graphs with the same variable. Give an example.

# Attach a descriptive label to the GLUCOSE column once. The text is stored
# as the column's "label" attribute (see the str() output below), so it can
# be looked up later instead of being retyped for every plot.
Hmisc::label(HANES[["GLUCOSE"]]) <- "Plasma Glucose [mg/dL]"
str(HANES[["GLUCOSE"]])
##  'labelled' int [1:1527] 83 81 86 93 90 92 85 72 87 96 ...
##  - attr(*, "label")= chr "Plasma Glucose [mg/dL]"

Change the type to “l” and report the plot type.

# With type = "l" the observations are joined by line segments instead of
# being drawn as individual points.
plot(
  HANES$GLUCOSE, HANES$GLUCOSESI,
  type = "l",
  col = "blue",
  main = "Plasma vs Blood Glucose",
  xlab = "Plasma Glucose [mg/dL]",
  ylab = expression(paste("Blood Glucose SI units [", mu, "mole/L]"))
)

Do the above exercise with “mfcol” argument. How does it plot?

# With mfcol the 2 x 2 grid is filled column by column: the first two plots
# go down the left column and the next two go down the right column.
# par() returns the previous settings, which are kept so they can be restored.
old_par <- par(mfcol = c(2, 2))
plot(HANES$LDL, HANES$HDL)
plot(HANES$A1C, HANES$HDL)
plot(HANES$GLUCOSE, HANES$HDL)
plot(HANES$CHOLESTEROLTOTAL, HANES$HDL)
# Restore the previous layout so that later plots are not squeezed into one
# cell of the 2 x 2 grid.
par(old_par)

Make a plot and add elements through the functions points(), lines(), segments() and text().

# Base scatter plot: x runs from 10 to 30 and y from 30 to 50.
newplot_a <- 10:30
newplot_b <- 30:50

# Coordinates of the extra point added with points(). Its y value lies above
# the data, so the y-axis is widened to include it; with the default limits
# the point would fall outside the plotting region and never be drawn.
point_x <- 14
point_y <- 60
plot(
  newplot_a, newplot_b,
  main = "Random Plot",
  ylim = range(newplot_b, point_y)
)

# points(): a single orange filled triangle (pch = 17).
points(point_x, point_y, col = "orange", pch = 17)

# lines(): the line y = 2x for x from 15 to 25.
line_x <- 15:25
line_y <- 2 * line_x
lines(line_x, line_y, col = "blue")

# segments(): one straight segment from (15, 45) to (20, 30).
segments(x0 = 15, y0 = 45, x1 = 20, y1 = 30, col = "red")

# text(): the word "hello" centred at (25, 35).
text(25, 35, "hello", col = "green")

#Lecture 4 Questions

Change x to 5 and re-run the above code. What does that print?

# Nothing is printed: x is 5, the only condition tested is x < 0, and there
# is no else branch to say what should happen otherwise.
x <- 5
if (x < 0) print("x is a negative number")

Repeat the previous classwork, this time adding an else statement that prints “x is positive or zero”, when needed.

x <- 5
# if/else is used here as an expression: it selects the message for the sign
# of x and print() displays it.
print(if (x < 0) "x is a negative number" else "x is positive or zero")
## [1] "x is positive or zero"

Create an R script that returns the max value of a vector x with length 3. Don’t use the aid of an auxiliary variable.

# max() returns the largest element of the vector directly, so no auxiliary
# variable is needed to keep track of a running maximum.
vec_1 <- c(15, 3, 12)
max(vec_1)
## [1] 15

Compare two vectors of equal length and output the result. How about vectors of unequal length? Why does the result make sense? Compare two matrices and two lists, and explain how R handles such comparisons. Explore further the example of the “inclusion operator”: does it matter if you reverse the order of the compared elements? If yes, what is the difference?

# Vectors of equal length are compared element by element.
test_1 <- 1:5
test_2 <- 5:1
test_1 > test_2
## [1] FALSE FALSE FALSE  TRUE  TRUE

# For vectors of unequal length the shorter one is recycled; R warns when the
# longer length is not a multiple of the shorter one.
test_3 <- 1:6
test_3 > test_2
## Warning in test_3 > test_2: longer object length is not a multiple of
## shorter object length
## [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE

# Matrices are also compared element by element and the result keeps the
# matrix shape. These matrices mix numbers and letters, so every entry is
# stored as a character string and `<` compares the entries as strings.
matrix_1 <- matrix(c(1, 2, "c", 4, 5, 6, 7, "d", 9), nrow = 3)
matrix_2 <- matrix(c(9, 8, 7, 6, "a", "b", 3, 2, 1), nrow = 3)
matrix_1 < matrix_2
##       [,1] [,2]  [,3]
## [1,]  TRUE TRUE FALSE
## [2,]  TRUE TRUE FALSE
## [3,] FALSE TRUE FALSE

# You cannot use the comparison operator on two lists: R raises an error.
# The call is wrapped in try() so the error is still reported but the rest
# of the script keeps running.
list_1 <- list("a", 2, "hello")
list_2 <- list("b", 1, "world")
try(list_1 < list_2)
## Error in list_1 < list_2: comparison of these types is not implemented

# For the inclusion operator the order matters: x %in% y returns one logical
# value per element of x, telling whether that element occurs anywhere in y.
# Swapping the operands therefore tests a different set of elements, which is
# why the two results below differ.
matrix_1 %in% matrix_2
## [1]  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE
matrix_2 %in% matrix_1
## [1]  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE

What does this function do?

# f() returns x unchanged when x is zero or positive and returns -x when x is
# negative; in other words, it computes the absolute value of a single number.
f <- function(x) {
  if (!(x < 0)) {
    return(x)
  }
  -x
}

Use rm(x) in the above code to remove x from the global environment and report what happens when you call the function.

# f() defines y locally but takes x from the environment it was defined in.
x <- 10
f <- function() {
  y <- 3
  c(x, y)
}

rm(x)
# Once x has been removed, calling f() fails with "object 'x' not found".
# try() reports the error without aborting the rest of the script.
try(f())
## Error in f(): object 'x' not found

How do we get a to advance by 1 at each call?

# f() keeps no state of its own. It checks whether an object named "a" can be
# found: if not, it starts at 10; otherwise it takes that value plus 1. The
# result is printed and, because print() returns its argument, also returned.
# Assigning the result back to `a` on every call is what makes the value
# advance by 1 each time; rm(a) resets the sequence.
f <- function() {
  a <- if (exists("a")) a + 1 else 10
  print(a)
}
a <- f()
## [1] 10
a <- f()
## [1] 11
a <- f()
## [1] 12
rm(a)

How do you test for NA in a vector?

# is.na() tests every element and returns a logical vector of the same
# length, with TRUE at the positions that hold NA.
na_vector <- c(NA, 2, 17, NA, 5)
is.na(na_vector)
## [1]  TRUE FALSE FALSE  TRUE FALSE