#1: Create a list “my_list” that contains two elements A = c(1:5, 7:3) and B = matrix(1:6, nrow=2)
# Build a named list holding an integer vector (A) and a 2-row matrix (B).
my_list <- list(
  A = c(1:5, 7:3),
  B = matrix(1:6, nrow = 2)
)

# 1A: Use lapply() to find the length of each of my_list's elements.
# lapply() calls length() once per list element and returns the results as a
# list that keeps the element names.
length_list <- lapply(X = my_list, FUN = length)
print(length_list) # A holds 10 values, B holds 6 values
## $A
## [1] 10
##
## $B
## [1] 6

# 1B: Find the sum of each of my_list's elements.
# Same pattern as 1A, with sum() adding up the values inside each element.
sum_list <- lapply(X = my_list, FUN = sum)
print(sum_list) # the values in A add up to 40, the values in B add up to 21
## $A
## [1] 40
##
## $B
## [1] 21
#2A: Use the apply() function to create boxplots for the first four columns of the “iris” dataset.
# Keep only the first four columns of the iris dataset.
iris_columns <- iris[, seq_len(4)]
# apply() with MARGIN = 2 walks over the columns and hands each one to
# boxplot(), so one plot is drawn per variable; whatever boxplot() returns for
# each column is collected in boxplot_iris.
boxplot_iris <- apply(X = iris_columns, MARGIN = 2, FUN = boxplot)
# The plots appear in column order:
# a) Sepal.Length, b) Sepal.Width, c) Petal.Length, d) Petal.Width
#2B: Do you observe any outliers for each variable? If so, remove the outliers and present boxplots again.
#the only boxplot with outliers is the Sepal.Width one, which is why only the Sepal.Width column is handled in this question
# boxplot.stats()$out returns the observations that lie beyond the whiskers of
# the boxplot; for Sepal.Width these are 4.4, 4.1, 4.2 and 2.0.
outliers_iris <- boxplot.stats(iris_columns[["Sepal.Width"]])$out
# Keep every Sepal.Width value that is not one of the flagged outliers.
no_outliers <- with(
  iris_columns,
  Sepal.Width[!(Sepal.Width %in% outliers_iris)]
)
# Redraw the boxplot using only the retained values.
no_outliers_plot <- boxplot(x = no_outliers)
#2C: Apply the shapiro.test() function using the apply() function to the first four columns of the “iris” dataset. According to the test results, which variables violate the normality assumption?
# Run the Shapiro-Wilk test on all four columns in one pass with apply(), as
# the question asks, instead of four copy-pasted shapiro.test() calls.
# MARGIN = 2 hands each column to shapiro.test(); the result is a list of test
# objects named after the columns.
shapiro_results <- apply(iris_columns, MARGIN = 2, FUN = shapiro.test)

# Collect the W statistic and the p-value of each test into one small table
# (row names are the variable names).
shapiro_summary <- data.frame(
  W = vapply(shapiro_results, function(res) unname(res$statistic), numeric(1)),
  p_value = vapply(shapiro_results, function(res) res$p.value, numeric(1))
)

# The null hypothesis of the Shapiro-Wilk test is that the data are normally
# distributed. A p-value below 0.05 therefore means the variable DOES differ
# significantly from a normal distribution (normality assumption violated);
# a p-value above 0.05 means there is no significant evidence against
# normality.
shapiro_summary$violates_normality <- shapiro_summary$p_value < 0.05
print(shapiro_summary)

# Values reported by shapiro.test() for each variable:
##
## Shapiro-Wilk normality test
##
## Sepal.Length: W = 0.97609, p-value = 0.01018    -> p < 0.05, violates normality
## Sepal.Width:  W = 0.98492, p-value = 0.1012     -> p > 0.05, normality maintained
## Petal.Length: W = 0.87627, p-value = 7.412e-10  -> p < 0.05, violates normality
## Petal.Width:  W = 0.90183, p-value = 1.68e-08   -> p < 0.05, violates normality
#
# Answer: Sepal.Length, Petal.Length and Petal.Width violate the normality
# assumption; Sepal.Width does not.
#2D: For the non-normal variables identified in Question C, use the plot_normality() function in the dlookr package to assess whether log and square-root transformations resolve the non-normality issue.
# Install dlookr only when it is missing. The package name must be wrapped in
# plain ASCII double quotes: typographic (curly) quotes are not valid R syntax
# and stop the script from parsing.
if (!requireNamespace("dlookr", quietly = TRUE)) {
  install.packages("dlookr")
}
library(dlookr)
## Registered S3 methods overwritten by 'dlookr':
## method from
## plot.transform scales
## print.transform scales
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
##
## transform

# plot_normality() is called once for each variable that failed the
# Shapiro-Wilk test in 2C, to compare the original data with its log and
# square-root transformations.

# Sepal.Length: both the log *and* the square-root transformation appear to
# resolve the normality problem; the data look normally distributed on those
# graphs.
plot_normality(iris_columns, Sepal.Length)
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the dlookr package.
## Please report the issue at <https://github.com/choonghyunryu/dlookr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Petal.Length: NEITHER the square-root nor the log transformation appears to
# resolve the non-normality; the data do NOT look normally distributed on
# either graph.
plot_normality(iris_columns, Petal.Length)

# Petal.Width: NEITHER the square-root nor the log transformation appears to
# resolve the non-normality; the data do NOT look normally distributed on
# either graph.
plot_normality(iris_columns, Petal.Width)
#3A: Use the aggr() function in the VIM package to assess the rate and pattern of missing data.
# Install VIM and mice only when they are missing. Each call sits on its own
# line and uses plain ASCII quotes: two calls on one line without a separator,
# or typographic (curly) quotes, do not parse in R.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}
if (!requireNamespace("mice", quietly = TRUE)) {
  install.packages("mice")
}
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind

# Work on a copy of the nhanes example data.
data_nhanes <- nhanes

# aggr() draws (1) the proportion of missing values per variable and (2) the
# combinations of missing variables across rows.
# labels must come from the data set being plotted: names(data) would look up
# the function `data`, not this data frame, and yield NULL.
aggr(
  data_nhanes,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(data_nhanes),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)
##
## Variables sorted by number of missings:
## Variable Count
## chl 0.40
## bmi 0.36
## hyp 0.32
## age 0.00
#the first graph tells us that there is missing data in the following columns: CHL (40% of entries in column), BMI (36% of entries in column), HYP (32% of entries in column). There is no missing data in the AGE column (0%)
#the second graph tells us the combination of missing data in each row (subject). For example: a) 4% of the subjects/rows are missing data in the BMI & HYP columns, b) 4% of subjects/rows are missing data only in the BMI column, c) 12% of subjects/rows are missing data only in the CHL column, and d) 28% of subjects/rows are missing data in the CHL, BMI, and HYP columns. By contrast, 52% of subjects/rows are not missing any data.
#3B: Use the mice() function to impute missing data for 10 items (set the imputation method to pmm and the seed to 123).
# Impute the missing nhanes values: create ten imputed data sets (m = 10)
# with the "pmm" method and a fixed seed so the run can be reproduced.
imputed_data <- mice(
  data = data_nhanes,
  m = 10,
  method = "pmm",
  seed = 123
)
##
## iter imp variable
## 1 1 bmi hyp chl
## 1 2 bmi hyp chl
## 1 3 bmi hyp chl
## 1 4 bmi hyp chl
## 1 5 bmi hyp chl
## 1 6 bmi hyp chl
## 1 7 bmi hyp chl
## 1 8 bmi hyp chl
## 1 9 bmi hyp chl
## 1 10 bmi hyp chl
## 2 1 bmi hyp chl
## 2 2 bmi hyp chl
## 2 3 bmi hyp chl
## 2 4 bmi hyp chl
## 2 5 bmi hyp chl
## 2 6 bmi hyp chl
## 2 7 bmi hyp chl
## 2 8 bmi hyp chl
## 2 9 bmi hyp chl
## 2 10 bmi hyp chl
## 3 1 bmi hyp chl
## 3 2 bmi hyp chl
## 3 3 bmi hyp chl
## 3 4 bmi hyp chl
## 3 5 bmi hyp chl
## 3 6 bmi hyp chl
## 3 7 bmi hyp chl
## 3 8 bmi hyp chl
## 3 9 bmi hyp chl
## 3 10 bmi hyp chl
## 4 1 bmi hyp chl
## 4 2 bmi hyp chl
## 4 3 bmi hyp chl
## 4 4 bmi hyp chl
## 4 5 bmi hyp chl
## 4 6 bmi hyp chl
## 4 7 bmi hyp chl
## 4 8 bmi hyp chl
## 4 9 bmi hyp chl
## 4 10 bmi hyp chl
## 5 1 bmi hyp chl
## 5 2 bmi hyp chl
## 5 3 bmi hyp chl
## 5 4 bmi hyp chl
## 5 5 bmi hyp chl
## 5 6 bmi hyp chl
## 5 7 bmi hyp chl
## 5 8 bmi hyp chl
## 5 9 bmi hyp chl
## 5 10 bmi hyp chl
#mice -> allows us to start imputation on nhanes dataset
#m = 10 -> we want to create 10 datasets with missing data filled in (i.e., imputed)
#method = pmm -> allows us to use predictive mean matching as a way to impute data
#seed = 123 --> fixes the random number seed so the imputed values are reproducible
#3C: Extract and print out the 10th imputed data set.
# Pull the 10th completed data set out of the imputation result ...
imputed_10 <- complete(imputed_data, action = 10)
# ... and print it so the filled-in values can be inspected.
print(x = imputed_10)
## age bmi hyp chl
## 1 1 29.6 1 229
## 2 2 22.7 1 187
## 3 1 27.2 1 187
## 4 3 27.4 1 186
## 5 1 20.4 1 113
## 6 3 21.7 2 184
## 7 1 22.5 1 118
## 8 1 30.1 1 187
## 9 2 22.0 1 238
## 10 2 25.5 1 204
## 11 1 29.6 1 184
## 12 2 27.5 2 131
## 13 3 21.7 1 206
## 14 2 28.7 2 204
## 15 1 29.6 1 204
## 16 1 25.5 1 118
## 17 3 27.2 2 284
## 18 2 26.3 2 199
## 19 1 35.3 1 218
## 20 3 25.5 2 218
## 21 1 22.0 1 187
## 22 1 33.2 1 229
## 23 1 27.5 1 131
## 24 3 24.9 1 218
## 25 2 27.4 1 186