#1: Create a list “my_list” that contains two elements A = c(1:5, 7:3) and B = matrix(1:6, nrow=2)

# Build a named list: A is an integer vector, B is a 2-row integer matrix.
my_list <- list(
  A = c(1:5, 7:3),
  B = matrix(1:6, nrow = 2)
)

#1A: Use lapply() to find the length for each of my_list’s elements.

# lapply() calls length() on every element of my_list and returns the
# results as a list with the same names (A and B).
length_list <- lapply(X = my_list, FUN = length)

length_list # A holds 10 values; B holds 6 values
## $A
## [1] 10
## 
## $B
## [1] 6

#1B: Find the sum for each of my_list’s elements

# Same pattern with sum(): add up the values stored in each element.
sum_list <- lapply(X = my_list, FUN = sum)

sum_list # the values in A add up to 40; the values in B add up to 21
## $A
## [1] 40
## 
## $B
## [1] 21

#2A: Use the apply() function to create boxplots for the first four columns of the “iris” dataset.

# Keep only the first four columns of iris (the numeric measurements).
iris_columns <- iris[, seq_len(4)]

# apply() with MARGIN = 2 calls boxplot() once per column, so four plots are
# drawn; whatever each boxplot() call returns is collected in boxplot_iris.
boxplot_iris <- apply(X = iris_columns, MARGIN = 2, FUN = boxplot)

# The boxplots appear in column order:
# a) Sepal.Length, b) Sepal.Width, c) Petal.Length, d) Petal.Width

#2B: Do you observe any outliers for each variable? If so, remove the outliers and present boxplots again.

# Sepal.Width is the only variable whose boxplot shows outliers, so it is the
# only one handled in this question.

# boxplot.stats()$out returns the observations lying beyond the whiskers of
# the boxplot; here these are 4.4, 4.1, 4.2 and 2.0.
outliers_iris <- boxplot.stats(iris_columns$Sepal.Width)[["out"]]

# Drop every Sepal.Width value that matches one of the outliers.
no_outliers <- with(iris_columns, Sepal.Width[!(Sepal.Width %in% outliers_iris)])

# Redraw the Sepal.Width boxplot using the values that remain.
no_outliers_plot <- boxplot(no_outliers)

#2C: Apply the shapiro.test() function using the apply() function to the first four columns of the “iris” dataset. According to the test results, which variables violate the normality assumption?

# Run the Shapiro-Wilk normality test on every column with a single apply()
# call, as the question asks. MARGIN = 2 applies shapiro.test() column by
# column; the result is a list holding one test result per variable.
shapiro_results <- apply(iris_columns, MARGIN = 2, shapiro.test)

shapiro_results # prints the W statistic and p-value for each variable

# Collect the p-values into one named numeric vector so they can be compared
# against the 0.05 threshold in one step.
shapiro_p <- vapply(
  shapiro_results,
  function(res) res[["p.value"]],
  numeric(1)
)

shapiro_p

# Interpretation (null hypothesis: the data come from a normal distribution):
#   Sepal.Length: W = 0.97609, p = 0.01018   -> p < 0.05, the data differ
#                 significantly from a normal distribution (normality violated)
#   Sepal.Width:  W = 0.98492, p = 0.1012    -> p > 0.05, the data do not differ
#                 significantly from a normal distribution (normality maintained)
#   Petal.Length: W = 0.87627, p = 7.412e-10 -> p < 0.05, normality violated
#   Petal.Width:  W = 0.90183, p = 1.68e-08  -> p < 0.05, normality violated

# Names of the variables that violate the normality assumption.
names(shapiro_p)[shapiro_p < 0.05]

#2D: For the non-normal variables identified in Question C, use the plot_normality() function in the dlookr package to assess whether log and square-root transformations resolve the non-normality issue.

# Install dlookr only when it is not already available. The package name must
# be wrapped in plain ASCII double quotes; typographic quotes do not parse.
if (!requireNamespace("dlookr", quietly = TRUE)) {
  install.packages("dlookr")
}

# Attach dlookr so plot_normality() can be called below.
library(dlookr)
## Registered S3 methods overwritten by 'dlookr':
##   method          from  
##   plot.transform  scales
##   print.transform scales
## 
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
## 
##     transform
# Visual normality check for each variable that failed the Shapiro-Wilk test
# in 2C. The note after each call records what the resulting plots showed for
# the log and square-root transformations.
plot_normality(iris_columns, Sepal.Length) #judging from the plots, BOTH the log and the square-root transformation appear to resolve the non-normality; the transformed data look approximately normally distributed
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the dlookr package.
##   Please report the issue at <https://github.com/choonghyunryu/dlookr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

plot_normality(iris_columns, Petal.Length) #judging from the plots, NEITHER the square-root nor the log transformation resolves the non-normality; the transformed data still do not look normally distributed

plot_normality(iris_columns, Petal.Width) #judging from the plots, NEITHER the square-root nor the log transformation resolves the non-normality; the transformed data still do not look normally distributed

#3A: Use the aggr() function in the VIM package to assess the rate and pattern of missing data.

# Install VIM and mice only when they are not already available. Each call is
# its own statement, and the package names use plain ASCII double quotes.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}
if (!requireNamespace("mice", quietly = TRUE)) {
  install.packages("mice")
}

library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
# Work on a copy of the nhanes data set (available once mice is attached).
data_nhanes <- nhanes

# Plot the rate and the pattern of missing data.
# labels must come from the data set itself: names(data) would look up the
# names of the utils::data function and evaluate to NULL.
aggr(
  data_nhanes,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(data_nhanes),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)

## 
##  Variables sorted by number of missings: 
##  Variable Count
##       chl  0.40
##       bmi  0.36
##       hyp  0.32
##       age  0.00
#the first graph shows the proportion of missing values in each column: CHL (40% of entries), BMI (36% of entries), and HYP (32% of entries). There is no missing data in the AGE column (0%)


#the second graph shows the combinations of missing data across rows (subjects). For example: a) 4% of the subjects/rows are missing data in both the BMI & HYP columns, b) 4% of subjects/rows are missing data only in the BMI column, c) 12% of subjects/rows are missing data only in the CHL column, and d) 28% of subjects/rows are missing data in the CHL, BMI, and HYP columns. By contrast, 52% of subjects/rows are not missing any data.

#3B: Use the mice() function to impute missing data for 10 items (set the imputation method to pmm and the seed to 123).

# Impute the missing values of data_nhanes: 10 imputed data sets, predictive
# mean matching as the imputation method, and a fixed seed.
imputed_data <- mice(
  data = data_nhanes,
  m = 10,
  method = "pmm",
  seed = 123
)
## 
##  iter imp variable
##   1   1  bmi  hyp  chl
##   1   2  bmi  hyp  chl
##   1   3  bmi  hyp  chl
##   1   4  bmi  hyp  chl
##   1   5  bmi  hyp  chl
##   1   6  bmi  hyp  chl
##   1   7  bmi  hyp  chl
##   1   8  bmi  hyp  chl
##   1   9  bmi  hyp  chl
##   1   10  bmi  hyp  chl
##   2   1  bmi  hyp  chl
##   2   2  bmi  hyp  chl
##   2   3  bmi  hyp  chl
##   2   4  bmi  hyp  chl
##   2   5  bmi  hyp  chl
##   2   6  bmi  hyp  chl
##   2   7  bmi  hyp  chl
##   2   8  bmi  hyp  chl
##   2   9  bmi  hyp  chl
##   2   10  bmi  hyp  chl
##   3   1  bmi  hyp  chl
##   3   2  bmi  hyp  chl
##   3   3  bmi  hyp  chl
##   3   4  bmi  hyp  chl
##   3   5  bmi  hyp  chl
##   3   6  bmi  hyp  chl
##   3   7  bmi  hyp  chl
##   3   8  bmi  hyp  chl
##   3   9  bmi  hyp  chl
##   3   10  bmi  hyp  chl
##   4   1  bmi  hyp  chl
##   4   2  bmi  hyp  chl
##   4   3  bmi  hyp  chl
##   4   4  bmi  hyp  chl
##   4   5  bmi  hyp  chl
##   4   6  bmi  hyp  chl
##   4   7  bmi  hyp  chl
##   4   8  bmi  hyp  chl
##   4   9  bmi  hyp  chl
##   4   10  bmi  hyp  chl
##   5   1  bmi  hyp  chl
##   5   2  bmi  hyp  chl
##   5   3  bmi  hyp  chl
##   5   4  bmi  hyp  chl
##   5   5  bmi  hyp  chl
##   5   6  bmi  hyp  chl
##   5   7  bmi  hyp  chl
##   5   8  bmi  hyp  chl
##   5   9  bmi  hyp  chl
##   5   10  bmi  hyp  chl
#mice -> allows us to start imputation on nhanes dataset
#m = 10 -> we want to create 10 datasets with missing data filled in (i.e., imputed)
#method = pmm -> allows us to use predictive mean matching as a way to impute data
#seed = 123 -> keeps the randomized imputations reproducible

#3C: Extract and print out the 10th imputed data set.

# Pull the 10th completed data set out of the imputation result.
imputed_10 <- mice::complete(imputed_data, action = 10)

# Show the 10th imputed data set in full.
print(x = imputed_10)
##    age  bmi hyp chl
## 1    1 29.6   1 229
## 2    2 22.7   1 187
## 3    1 27.2   1 187
## 4    3 27.4   1 186
## 5    1 20.4   1 113
## 6    3 21.7   2 184
## 7    1 22.5   1 118
## 8    1 30.1   1 187
## 9    2 22.0   1 238
## 10   2 25.5   1 204
## 11   1 29.6   1 184
## 12   2 27.5   2 131
## 13   3 21.7   1 206
## 14   2 28.7   2 204
## 15   1 29.6   1 204
## 16   1 25.5   1 118
## 17   3 27.2   2 284
## 18   2 26.3   2 199
## 19   1 35.3   1 218
## 20   3 25.5   2 218
## 21   1 22.0   1 187
## 22   1 33.2   1 229
## 23   1 27.5   1 131
## 24   3 24.9   1 218
## 25   2 27.4   1 186