# Patrick O'Brien | 2024-09-05| BIOSTAT 521 | Lab 0

# Setup: load the tidyverse, move to the lab folder, and read the lab data.

# Install the tidyverse only when it is missing, so re-running the script
# does not reinstall it every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# NOTE(review): this absolute path is machine-specific; adjust it before
# running elsewhere. The CSV below is read relative to this directory.
setwd("C:/Users/paob/Documents/Offline Work/R/Lab0")

# Read the lab data; the literal string "NA" marks a missing value.
DATA <- read_csv("EPID_521_lab_data.csv", na = "NA")

# Inspect the imported data.
view(DATA)

# Plot the Age variable using a histogram, use the main and xlab options to
# provide names for the figure and the x-axis, respectively

# Histogram of the raw age column with a custom title and x-axis label.
hist(DATA$RIDAGEYR, main = "Histogram of Age Variable", xlab = "Age")

# Plot the Age variable using a Boxplot

# Boxplot of age, followed by its mean, standard deviation and summary.
# Missing ages are dropped before computing the mean and sd.
boxplot(DATA$RIDAGEYR)
mean(DATA$RIDAGEYR, na.rm = TRUE)
sd(DATA$RIDAGEYR, na.rm = TRUE)
summary(DATA$RIDAGEYR)

# create a table of counts

table(DATA$RIAGENDR)

# create a table of proportions

prop.table(table(DATA$RIAGENDR))

# create a barplot of the gender counts, one colour per level

barplot(table(DATA$RIAGENDR), col = c("red", "blue"))

# counts and barplot for the education variable

table(DATA$DMDEDUC2)
barplot(table(DATA$DMDEDUC2), cex.names = 0.7)

# The `cex.names` argument in the `barplot` function controls how large to print the names of the levels under the bars. I have made them a little smaller than the default so they all fit nicely
# order the levels of the education variable

# Turn education into an ordered factor, from least to most schooling,
# so tables and barplots list the levels in that order.
DATA$DMDEDUC2 <- ordered(
  DATA$DMDEDUC2,
  levels = c(
    "NoHighSchool",
    "SomeHighSchool",
    "HighSchool",
    "SomeCollege",
    "College"
  )
)
table(DATA$DMDEDUC2)
barplot(table(DATA$DMDEDUC2), cex.names = 0.9)

# rename the Age variable

DATA <- rename(DATA, Age = RIDAGEYR)

# create a new variable Age50 to indicate which samples are aged 50 and above
# (1 = aged 50 or above, 0 = under 50)

DATA <- mutate(DATA, Age50 = ifelse(Age >= 50, 1, 0))

# count the number of samples aged 50 and above

table(DATA$Age50)

# Proportions by self-reported race. The column is still named RIDRETH1 at
# this point in the script, so it is referenced by that name here.
prop.table(table(DATA$RIDRETH1))

# 1. Each dataset contains a variable for self-reported race. Rename that
#     variable to simply race. Is there any need to order the levels of
#     the race variable? Why?

# Give the self-reported race column the shorter name `race`.
DATA <- DATA %>%
  rename(race = RIDRETH1)

# There is no need to order the levels, as they are categorical nominal,
# meaning there is no natural order between the values.

# 2. How many levels are included in the race variable? What are the
#     proportions for each group in your dataset?

# Proportion of respondents in each race group, then a barplot of them.
PT <- prop.table(table(DATA$race))
barplot(
  PT,
  cex.names = 0.55,
  cex.main = 1,    # change font size of title
  cex.lab = 0.75,  # change font size of axis labels
  col = rgb(0.2, 0.4, 0.6, 0.6),
  main = "Proportion of Study Respondents by Race",
  xlab = "Race",
  ylab = "Proportion of Total Respondents",
  ylim = c(0, 0.5)
)

# There are 5 levels
#
# Black: 0.22921790
# MexicanAmerican: 0.23954747
# Other: 0.04230202
# OtherHispanic: 0.03885883
# White: 0.45007378  
 
  
# 3. Compute the mean, standard deviation and Five-Number Summary for
#     the BMI variable in your dataset.

# Rename the BMI column, then report its mean, standard deviation and
# five-number summary (min, lower hinge, median, upper hinge, max).
DATA <- rename(DATA, BMI = BMXBMI)
mean(DATA$BMI, na.rm = TRUE)
sd(DATA$BMI, na.rm = TRUE)
fivenum(DATA$BMI)  # fivenum() drops missing values by default

# 4. Based on the descriptive statistics for BMI, do you have any concerns
#     about potential outlier values?

#Yes. The maximum appears quite large. This is worth investigating.

# Number of standard deviations the largest BMI lies above the mean,
# (max - mean) / sd, computed from the data so it stays in sync with it.
(max(DATA$BMI, na.rm = TRUE) - mean(DATA$BMI, na.rm = TRUE)) /
  sd(DATA$BMI, na.rm = TRUE)

#(Max - Mean) / SD = number of standard deviations the maximum lies above the mean

#The largest number is over fourteen standard deviations away from the mean,
#which is  outside of an expected range. To test this, we will check the 
#highest value against the interquartile range.

# Interquartile range of BMI.
IQR(DATA$BMI, na.rm = TRUE)

# Upper outlier fence, Q3 + 1.5 * IQR.
# NOTE(review): 32.29 and 7.7 are presumably the upper quartile and the IQR
# copied from earlier output -- confirm against the printed values.
32.29 + 1.5 * 7.7

#The above formula equals 43.84

#130.21 is well outside of the given range for identifying outliers, and
#therefore 130.21 can be considered a statistical outlier. It is not crazy to
#assume that the decimal was entered incorrectly, as 13.021 is a relatively
#low but reasonable BMI.

# 5. Based on the descriptive statistics for BMI, do you think the
#     distribution for BMI is most likely to be left skewed, right
#     skewed or symmetric? Why?

#Right skewed. In this instance, mean is greater than the median
#(29.25318>27.83). This usually indicates right skew.

# 6. Confirm your answer to the above question by creating an
#     appropriate plot to visualize the distribution of BMI.

# Histogram of BMI; the x-axis runs to 140 so the largest value is visible.
hist(
  DATA$BMI,
  cex.main = 1,    # change font size of title
  cex.lab = 0.75,  # change font size of axis labels
  col = rgb(0.2, 0.4, 0.6, 0.6),
  main = "Histogram of Body Mass Index",
  xlab = "BMI",
  xlim = c(0, 140),
  ylab = "Frequency",
  ylim = c(0, 1250)
)

#This histogram confirms the statistical hypothesis of right skew.

#7. Create a new variable called LogAge containing the natural
#   logarithm of the Age variable. (HINT: The log function computes
#   the natural log, logAge=log(Age) )

# Add LogAge, the natural logarithm of Age, as a new column.
DATA <- DATA %>%
  mutate(LogAge = log(Age))

#8. What does the distribution of your new variable logAge look like?
#   (HINT: Create a histogram or boxplot of the variable you created
#   in the previous step.) Compare this to the shape of the original
#   Age variable? That is, how did applying the natural log function
#   change the shape of the distribution of ages in the dataset.

# Histogram of the log-transformed age.
hist(
  DATA$LogAge,
  cex.main = 1,    # change font size of title
  cex.lab = 0.75,  # change font size of axis labels
  col = rgb(0.2, 0.4, 0.6, 0.6),
  main = "Histogram of Natural Log of Age",
  xlab = "Natural Log of Age",
  ylab = "Frequency"
)

# Five-number summary, mean and standard deviation of log age.
fivenum(DATA$LogAge)
mean(DATA$LogAge, na.rm = TRUE)
sd(DATA$LogAge, na.rm = TRUE)

# Histogram of the untransformed age, for comparison with log age.
hist(
  DATA$Age,
  cex.main = 1,    # change font size of title
  cex.lab = 0.75,  # change font size of axis labels
  col = rgb(0.2, 0.4, 0.6, 0.6),
  main = "Histogram of Age",
  xlab = "Age",
  ylab = "Frequency"
)

# Five-number summary, mean and standard deviation of age.
fivenum(DATA$Age)
mean(DATA$Age, na.rm = TRUE)
sd(DATA$Age, na.rm = TRUE)

#In this data set, "age" is bimodal right skewed, as indicated in both the
#histogram (visibly shown) and the statistics (46.71864>44; mean>median). The
#natural log of age adjusts the data by converting to a unimodal distribution 
#(if any modality at all) and by adjusting the skew to slightly left skewed
#(3.760848<3.784190; mean<median).

###

#9. Suppose that you are interested in designing a study with
#   individuals aged 65 and above. How many such samples in your
#   dataset? (HINT: create a new variable to identify samples 65+
#   and use a table to count.)

# Flag samples aged 65 and above (1 = 65 or above, 0 = under 65),
# then count how many fall in each group.
DATA <- mutate(DATA, Age65 = ifelse(Age >= 65, 1, 0))
table(DATA$Age65)

#   0    1 
#1606  427 
#427 individuals fit the criterion of being 65 years old or above.