Loading necessary libraries:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(naniar)
library(readr)
library(ggplot2)
library(broom)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
Loading the Pima Indians Diabetes dataset:
diab_data <- read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv", col_names= FALSE)
## Rows: 768 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): X1, X2, X3, X4, X5, X6, X7, X8, X9
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(diab_data) <- c("Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "Outcome")
head(diab_data)
## # A tibble: 6 × 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>
View(diab_data)
Checking for missing values:
pct_complete(diab_data)
## [1] 100
this result indicates that we do not have any missing values.
Let’s assume that the columns Glucose and BloodPressure in our data correspond to cholesterol levels of patients before and after performing a hypothecial inervention. Performing the Wilcoxon-signed rank test for paired differences of cholesterol levels before and after the hypothetical intervention:
wilcox.test(diab_data$Glucose, diab_data$BloodPressure, paired=TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: diab_data$Glucose and diab_data$BloodPressure
## V = 289180, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
We get a very small number for the p-value, which indicates that 2 groups (before and after the hypothetical intervention) are in fact statistically different.
Next, let’s load one of the built in datasets of R to perform ANOVA analysis.
data("Boston")
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Converting rad to a factor (categorical) for proper grouping in the plot
Boston$rad <- factor(Boston$rad)
#performing a test to see if mean age is different in rad categories
anova_result <- aov(age ~ rad, data = Boston)
## Creating a boxplot with individual points overlaid
ggplot(Boston, aes(x = rad, y = age, fill = rad)) +
geom_boxplot()+
geom_jitter(width = 0.2, size = 2, alpha = 0.8) +
theme_minimal()+
labs(
title = "Boston",
x = "Index of accessibility to radial highways",
y = "Age"
)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## rad 8 107293 13412 22.76 <2e-16 ***
## Residuals 497 292848 589
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The small p-value indicated that the mean age in this dataset is different depending on rad (index of accessibility to radial highways).
#creating a new dataset with the columns of our interest
new_B <- data_frame(crim=Boston$crim, indus=Boston$indus, rad=Boston$rad)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
new_B$rad <- factor(new_B$rad)
#performing manova analysis to see if indus and crime means are different in different categories of rad.
manova_result <- manova(cbind(indus,crim) ~ rad, data = new_B)
summary(manova_result)
## Df Pillai approx F num Df den Df Pr(>F)
## rad 8 0.6406 29.276 16 994 < 2.2e-16 ***
## Residuals 497
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggpairs(new_B, aes(color = rad)) +
theme_minimal() +
labs(
title = "Scatter Plot Matrix of percentage of land zoned for industrial purposes and crime index by Index of accessibility to radial highways"
) +
theme(
plot.title = element_text(hjust = 0.5, size=8)
)
## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We got a low p-value, which indicates that there is evidence to suggest that the independent variable (rad) has a statistically significant effect on the dependent variables (indus and crim).
Now, we will again use the diabetes data to perform MANCOVA analysis.
#Testing if Glucose and Insulin means are different in different Outcome categories, when we adjust for age.
mancova_result <- manova(cbind(Glucose, Insulin) ~ Outcome + Age, data = diab_data)
summary(mancova_result)
## Df Pillai approx F num Df den Df Pr(>F)
## Outcome 1 0.22448 110.573 2 764 < 2.2e-16 ***
## Age 1 0.05034 20.249 2 764 2.698e-09 ***
## Residuals 765
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
diab_data$Outcome <- factor(diab_data$Outcome)
ggpairs(diab_data[, c("Glucose", "Insulin", "Outcome")],
aes(color = Outcome)) +
theme_minimal() +
labs(
title = "Scatter Plot Matrix of Glucose and Insulin by Outcome",
x = "Dependent Variables",
y = "Dependent Variables"
) +
theme(
plot.title = element_text(hjust = 0.5) # Center the title
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The low p-value indicates that the means of Glucose and Insulin are significantly different based on Outcome/diabetic, non-diabetic/ or Age, or their interaction.