# Patrick O'Brien | 2024-09-05| BIOSTAT 521 | Lab 0
# Setup: make sure tidyverse is available, attach it, and load the lab data.
# Install only when the package is missing so that re-running the script does
# not reinstall it every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# NOTE(review): machine-specific absolute path; the script only runs where this
# directory exists. Consider an RStudio project or a relative path instead.
setwd("C:/Users/paob/Documents/Offline Work/R/Lab0")

# Read the lab data set; the literal string "NA" is treated as missing.
DATA <- read_csv("EPID_521_lab_data.csv", na = "NA")

# Inspect the loaded data.
view(DATA)
# Plot the Age variable using a histogram, use the main and xlab options to
# provide names for the figure and the x-axis, respectively
# Histogram of age with an explicit title and x-axis label.
hist(DATA$RIDAGEYR, main = "Histogram of Age Variable", xlab = "Age")
# Plot the Age variable using a Boxplot
boxplot(DATA$RIDAGEYR)

# Numeric summaries of age; missing values are dropped for the mean and SD.
mean(DATA$RIDAGEYR, na.rm = TRUE)
sd(DATA$RIDAGEYR, na.rm = TRUE)
summary(DATA$RIDAGEYR)
# create a table of counts
# Tabulate the gender variable: raw counts first ...
table(DATA[["RIAGENDR"]])
# ... then the same table rescaled so the cells sum to 1 (proportions)
prop.table(table(DATA[["RIAGENDR"]]))
# create a barplot
# Bar chart of gender counts, one colour per level.
barplot(table(DATA$RIAGENDR), col = c("red", "blue"))

# Counts and bar chart for the education variable.
table(DATA$DMDEDUC2)
barplot(table(DATA$DMDEDUC2), cex.names = 0.7)
# The `cex.names` argument in the `barplot` function controls how large to print the names of the levels under the bars. I have made them a little smaller than the default so they all fit nicely
# order the levels of the education variable
# Convert education to an ordered factor so tables and plots list the levels
# from least to most schooling instead of alphabetically.
DATA$DMDEDUC2 <- ordered(
  DATA$DMDEDUC2,
  levels = c(
    "NoHighSchool", "SomeHighSchool", "HighSchool", "SomeCollege", "College"
  )
)

# Re-tabulate and re-plot with the new level order.
table(DATA$DMDEDUC2)
barplot(table(DATA$DMDEDUC2), cex.names = 0.9)
# rename the Age variable
# RIDAGEYR becomes Age; all other columns are left untouched.
DATA <- DATA %>%
  rename(Age = RIDAGEYR)
# create a new variable Age50 to indicate which samples are aged 50 and above
# Age50 is 1 when Age is 50 or more and 0 otherwise.
DATA <- DATA %>%
  mutate(Age50 = ifelse(Age >= 50, 1, 0))
# count the number of samples aged 50 and above
table(DATA$Age50)

# Proportion of respondents in each self-reported race group. The column is
# still named RIDRETH1 at this point in the script (it is renamed to `race`
# further down), so it is referenced by its original name here.
prop.table(table(DATA$RIDRETH1))
# 1. Each dataset contains a variable for self-reported race. Rename that
# variable to simply race. Is there any need to order the levels of
# the race variable? Why?
# RIDRETH1 (self-reported race) becomes race.
DATA <- DATA %>%
  rename(race = RIDRETH1)
# There is no need to order the levels, as they are categorical nominal,
# meaning there is no natural order between the values.
# 2. How many levels are included in the race variable? What are the
# proportions for each group in your dataset?
# Share of respondents in each race group, kept in PT so it can be plotted.
PT <- prop.table(table(DATA$race))

# Bar chart of the proportions. The y-axis is capped at 0.5 and the level
# names are shrunk (cex.names) so every label fits under its bar.
barplot(
  PT,
  cex.names = 0.55,
  cex.main = 1, # change font size of title
  cex.lab = 0.75, # change font size of axis labels
  col = rgb(0.2, 0.4, 0.6, 0.6),
  main = "Proportion of Study Respondents by Race",
  xlab = "Race",
  ylab = "Proportion of Total Respondents",
  ylim = c(0, 0.5)
)
# There are 5 levels
#
# Black: 0.22921790
# MexicanAmerican: 0.23954747
# Other: 0.04230202
# OtherHispanic: 0.03885883
# White: 0.45007378
# 3. Compute the mean, standard deviation and Five-Number Summary for
# the BMI variable in your dataset.
# Give the BMI column a readable name.
DATA <- rename(DATA, BMI = BMXBMI)

# Mean and standard deviation, ignoring missing values.
mean(DATA$BMI, na.rm = TRUE)
sd(DATA$BMI, na.rm = TRUE)

# Five-number summary: minimum, lower hinge, median, upper hinge, maximum.
# fivenum() drops missing values by default.
fivenum(DATA$BMI)
# 4. Based on the descriptive statistics for BMI, do you have any concerns
# about potential outlier values?
#Yes. The maximum appears quite large. This is worth investigating.
# How far the largest BMI lies above the mean, in standard deviations:
# (max - mean) / sd. Computed from the data so it cannot drift out of sync
# with the summaries; the original typed-in version was
# (130.21 - 29.25318) / 7.204896, presumably copied from earlier output.
(max(DATA$BMI, na.rm = TRUE) - mean(DATA$BMI, na.rm = TRUE)) /
  sd(DATA$BMI, na.rm = TRUE)
#(Max - Mean) / SD = number of standard deviations the maximum lies above the mean
#The largest number is over fourteen standard deviations away from the mean,
#which is outside of an expected range. To test this, we will check the
#highest value against the interquartile range.
# Interquartile range of BMI, ignoring missing values.
IQR(DATA$BMI, na.rm = TRUE)

# Upper outlier fence: upper quartile + 1.5 * IQR.
# NOTE(review): 32.29 and 7.7 look like the upper quartile and the IQR copied
# from the output above -- confirm against the printed values.
32.29 + 1.5 * 7.7
#The above formula equals 43.84
#130.21 is well outside of the given range for identifying outliers, and
#therefore 130.21 can be considered a statistical outlier. It is not crazy to
#assume that the decimal was entered incorrectly, as 13.021 is a relatively
#low but reasonable BMI.
# 5. Based on the descriptive statistics for BMI, do you think the
# distribution for BMI is most likely to be left skewed, right
# skewed or symmetric? Why?
#Right skewed. In this instance, mean is greater than the median
#(29.25318>27.83). This usually indicates right skew.
# 6. Confirm your answer to the above question by creating an
# appropriate plot to visualize the distribution of BMI.
# Histogram of BMI. The x-axis runs to 140 so the extreme maximum stays
# visible on the plot.
hist(
  DATA$BMI,
  cex.main = 1, # change font size of title
  cex.lab = 0.75, # change font size of axis labels
  col = rgb(0.2, 0.4, 0.6, 0.6),
  main = "Histogram of Body Mass Index",
  xlab = "BMI",
  xlim = c(0, 140),
  ylab = "Frequency",
  ylim = c(0, 1250)
)
#This histogram confirms the statistical hypothesis of right skew.
#7. Create a new variable called LogAge containing the natural
# logarithm of the Age variable. (HINT: The log function computes
# the natural log, logAge=log(Age) )
# LogAge is the natural log of Age. The column is referenced by its bare name
# so mutate() resolves it through data masking rather than reaching back into
# the global DATA object.
DATA <- mutate(DATA, LogAge = log(Age))
#8. What does the distribution of your new variable LogAge look like?
# (HINT: Create a histogram or boxplot of the variable you created
# in the previous step.) Compare this to the shape of the original
# Age variable? That is, how did applying the natural log function
# change the shape of the distribution of ages in the dataset.
# Histogram of the log-transformed ages.
hist(
  DATA$LogAge,
  cex.main = 1, # change font size of title
  cex.lab = 0.75, # change font size of axis labels
  col = rgb(0.2, 0.4, 0.6, 0.6),
  main = "Histogram of Natural Log of Age",
  xlab = "Natural Log of Age",
  ylab = "Frequency"
)
# Five-number summary, mean and SD of LogAge, for comparing mean and median.
fivenum(DATA$LogAge)
mean(DATA$LogAge, na.rm = TRUE)
sd(DATA$LogAge, na.rm = TRUE)
# Histogram of the untransformed ages, styled like the LogAge histogram so the
# two shapes can be compared.
hist(
  DATA$Age,
  cex.main = 1, # change font size of title
  cex.lab = 0.75, # change font size of axis labels
  col = rgb(0.2, 0.4, 0.6, 0.6),
  main = "Histogram of Age",
  xlab = "Age",
  ylab = "Frequency"
)

# Matching numeric summaries for Age.
fivenum(DATA$Age)
mean(DATA$Age, na.rm = TRUE)
sd(DATA$Age, na.rm = TRUE)
#In this data set, "age" is bimodal right skewed, as indicated in both the
#histogram (visibly shown) and the statistics (46.71864>44; mean>median). The
#natural log of age adjusts the data by converting to a unimodal distribution
#(if any modality at all) and by adjusting the skew to slightly left skewed
#(3.760848<3.784190; mean<median).
###
#9. Suppose that you are interested in designing a study with
# individuals aged 65 and above. How many such samples are in your
# dataset? (HINT: create a new variable to identify samples 65+
# and use a table to count.)
# Age65 is 1 when Age is 65 or more and 0 otherwise.
DATA <- mutate(DATA, Age65 = ifelse(Age >= 65, 1, 0))

# Count how many respondents fall in each group.
table(DATA$Age65)
# 0 1
#1606 427
#427 individuals would fit this criterion of being 65 years old or above.