```{r setup, include=FALSE}
# Echo the source of every chunk in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)
```
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the **Knit** button, a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
```{r cars}
# 3. Pre-process data
# 3.1 Read data of the CSV file
# NOTE(review): `osteoporosis` is not created anywhere in this document; it
# must already exist in the knitting environment (presumably loaded from the
# CSV beforehand) -- confirm.
dataset <- osteoporosis

# Missing-value check: one overall flag, then the number of NAs per column.
any(is.na(dataset))
colSums(is.na(dataset))
# Give the 16 columns short, consistent names.
colnames(dataset) <- c(
  "ID", "Age", "Sex", "Hormon", "Fam", "Race", "Weight", "Ca_Intake",
  "D_Intake", "PA", "Smok", "Alcohol", "Condits", "Medics", "PreFrac",
  "Osteo"
)

# Check data type
str(dataset)
summary(dataset)
# Store the categorical columns as factors, ID as character and Age as
# numeric.
cols_to_factor <- c(
  "Sex", "Hormon", "Fam", "Race", "Weight", "Ca_Intake", "D_Intake", "PA",
  "Smok", "Alcohol", "Condits", "Medics", "PreFrac", "Osteo"
)
dataset[cols_to_factor] <- lapply(dataset[cols_to_factor], as.factor)
dataset$ID <- as.character(dataset$ID)
dataset$Age <- as.numeric(dataset$Age)

# Inspect the structure and summary again after the conversion.
str(dataset)
summary(dataset)
# Collect every row whose ID occurs more than once. Scanning from both ends
# flags the first occurrence as well as the later repeats.
duplicate_ids <- dataset[
  duplicated(dataset$ID) | duplicated(dataset$ID, fromLast = TRUE),
]
print(duplicate_ids)
# Install the plotting packages only when they are missing, so knitting does
# not re-download them on every run.
for (pkg in c("ggplot2", "plotly")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(ggplot2)
library(plotly)

# Single box plot of Age. The x aesthetic is an empty string because there is
# no grouping variable; the red point marks the mean.
d <- ggplot(dataset, aes(x = "", y = Age)) +
  geom_boxplot(fill = "lightblue") +
  stat_summary(
    fun = mean, geom = "point", shape = 20, size = 3, color = "red"
  ) +
  labs(title = "Box Plot of Age with Summary Statistics", y = "Age", x = "")

# Convert the ggplot object into an interactive plotly widget.
ggplotly(d)
library(ggplot2)

# Draw box plot of Age for each level of Sex.
q <- ggplot(dataset, aes(x = as.factor(Sex), y = Age)) +
  geom_boxplot() +
  labs(x = "Factor", y = "Age", title = "Box Plot of Age by Sex") +
  theme_minimal()
ggplotly(q)
# Normal Q-Q plot of Age with a red reference line.
qqnorm(dataset$Age, main = "Q-Q Plot of Age")
qqline(dataset$Age, col = "red", lwd = 2)
# Install tidyverse only when it is missing, then attach the packages used by
# the following plots.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
library(plotly)
library(ggplot2)
# Reshape seven categorical columns to long form: one row per
# (observation, variable) pair, keeping Age alongside. The factor columns have
# different level sets, so their values are converted to character to stack
# into one `Value` column.
long_data1 <- dataset %>%
  pivot_longer(
    cols = c(Sex, Hormon, Fam, Race, Weight, Ca_Intake, D_Intake),
    names_to = "Variable",
    values_to = "Value",
    values_transform = list(Value = as.character)
  ) %>%
  select(Age, Variable, Value)

## Box plots of Age, drawn once per variable via facet_wrap
p1 <- ggplot(long_data1, aes(x = Value, y = Age)) +
  geom_boxplot(fill = "lightblue") +
  labs(
    title = "Boxplot of Factors by Age - Part 1", x = "Factors", y = "Age"
  ) +
  # One panel per variable, each with its own x scale.
  facet_wrap(~ Variable, scales = "free_x") +
  theme_minimal() +
  # Rotate the x-axis labels.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Convert to an interactive plot.
ggplotly(p1)
# Reshape the remaining seven categorical columns to long form: one row per
# (observation, variable) pair, keeping Age alongside. The factor columns have
# different level sets, so their values are converted to character to stack
# into one `Value` column.
long_data2 <- dataset %>%
  pivot_longer(
    cols = c(PA, Smok, Alcohol, Condits, Medics, PreFrac, Osteo),
    names_to = "Variable",
    values_to = "Value",
    values_transform = list(Value = as.character)
  ) %>%
  select(Age, Variable, Value)

## Box plots of Age, drawn once per variable via facet_wrap
p2 <- ggplot(long_data2, aes(x = Value, y = Age)) +
  geom_boxplot(fill = "lightblue") +
  labs(
    title = "Boxplot of Factors by Age - Part 2", x = "Factors", y = "Age"
  ) +
  # One panel per variable, each with its own x scale.
  facet_wrap(~ Variable, scales = "free_x") +
  theme_minimal() +
  # Rotate the x-axis labels.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Convert to an interactive plot with ggplotly.
ggplotly(p2)
# Pie chart of the Osteo outcome. Each slice is labelled "No" (level "0") or
# "Yes" (any other level) followed by its count.
osteo_counts <- table(dataset$Osteo)
labels <- ifelse(names(osteo_counts) == "0", "No", "Yes")
labels <- paste(labels, osteo_counts, sep = ":")
pie(
  osteo_counts,
  main = "Percentage of Osteoperosis people",
  col = rainbow(length(osteo_counts)),
  labels = labels
)

# Draw multiple box plot for demographic var: Age, Sex, Race.
# Rows are Race, columns are Osteo; margins = TRUE adds "(all)" panels.
q <- ggplot(dataset, aes(Sex, Age)) +
  geom_boxplot(aes(fill = Osteo)) +
  labs(x = "Factor", y = "Age", title = "Box Plot of Factor by Age") +
  facet_grid(Race ~ Osteo, margins = TRUE)
ggplotly(q)
# Shapiro-Wilk normality test on Age for the whole data set.
shapiro_test_result <- shapiro.test(dataset$Age)
print(shapiro_test_result)

## For `Osteo` = 1
# NOTE(review): the subset keeps rows with Osteo == "1" although the variable
# is named `no_osteoporosis_data` -- confirm which group is intended.
no_osteoporosis_data <- subset(dataset, Osteo == "1")
shapiro_test_result <- shapiro.test(no_osteoporosis_data$Age)
print(shapiro_test_result)
# Histogram of Age within the `no_osteoporosis_data` subset. The subset is
# passed as the plot data and Age is mapped to x; a data frame cannot be used
# as an aesthetic. bins = 30 is geom_histogram's default, stated explicitly.
ggplot(no_osteoporosis_data, aes(x = Age)) +
  geom_histogram(bins = 30)
library(ggplot2)

# Histogram of Age on the density scale, overlaid with the normal density
# that has the sample mean and standard deviation of Age.
ggplot(dataset, aes(x = Age)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, fill = "skyblue", color = "black"
  ) +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(dataset$Age), sd = sd(dataset$Age)),
    color = "red", linewidth = 1
  ) +
  labs(
    title = "Histogram of Age with Normal Distribution Curve",
    x = "Age",
    y = "Density"
  )
# Chi-square test between each categorical predictor and Osteo, run on their
# contingency table.
variables <- c(
  "Sex", "Hormon", "Fam", "Race", "Weight", "Ca_Intake", "D_Intake", "PA",
  "Smok", "Alcohol", "Condits", "Medics", "PreFrac"
)
for (var_name in variables) {
  # cat() adds no newline of its own, so end the heading explicitly.
  cat("Chi-square for:", var_name, "\n")
  test_result <- chisq.test(table(dataset[[var_name]], dataset$Osteo))
  print(test_result)
  cat("\n")
}
# Two-sample t-test of Age between the Osteo groups. t.test() does not assume
# equal variances unless var.equal = TRUE is passed.
t_test_result <- t.test(Age ~ Osteo, data = dataset)
print(t_test_result)
# Wilcoxon rank-sum (Mann-Whitney) test of Age between the Osteo groups.
mann_whitney_result <- wilcox.test(Age ~ Osteo, data = dataset)
print(mann_whitney_result)
# One logistic-regression formula per thematic group of predictors. `*`
# expands to the main effects plus all of their interactions.
formulas <- list(
  Demographic = Osteo ~ Age * Sex * Hormon,
  Genetic = Osteo ~ Fam * Race,
  Nutrient = Osteo ~ Weight * Ca_Intake * D_Intake,
  Lifestyle = Osteo ~ PA * Alcohol * Smok,
  Medical = Osteo ~ Condits * Medics * PreFrac
)

# Fit a binomial glm for each formula, keep its summary under the group name
# and print it.
results <- list()
for (name in names(formulas)) {
  model <- glm(formulas[[name]], data = dataset, family = binomial)
  results[[name]] <- summary(model)
  # cat() adds no newline of its own, so end the heading explicitly.
  cat("## Summary for", name, "factors:\n")
  print(results[[name]])
}
# Other group: logistic model with all interactions of Age, Weight, PA and
# PreFrac.
# NOTE(review): the operators between Weight, PA and PreFrac were missing in
# the source; `*` is restored by analogy with the other formulas in this
# document -- confirm that `*` (not `+`) was intended.
model_interaction <- glm(
  Osteo ~ Age * Weight * PA * PreFrac,
  data = dataset,
  family = binomial
)
summary(model_interaction)
# Return the most frequent value of `v`.
#
# v: an atomic vector.
# Returns a length-one vector of the same type as `v`. When several values
# share the highest count, the one that appears first in `v` is returned.
get_mode <- function(v) {
  uniqv <- unique(v)
  # match() maps each element to its index in `uniqv`; tabulate() counts the
  # indices, and which.max() picks the first index with the highest count.
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
# Rows recorded as Male with Hormon == "Postmenopausal".
Male_Menopause <- subset(dataset, Sex == "Male" & Hormon == "Postmenopausal")
print(Male_Menopause)

# Box plot of Age for that subset. The title names the Male subset; it
# previously read "Female_Menopause".
Ques1 <- ggplot(Male_Menopause, aes(Sex, Age)) +
  geom_boxplot() +
  # Add the mode of Age as a red star.
  stat_summary(
    fun = get_mode, geom = "point", color = "red", size = 3, shape = 8
  ) +
  labs(x = "Factor", y = "Age", title = "Box Plot of Male_Menopause")
ggplotly(Ques1)
# Rows recorded as Female with Hormon == "Postmenopausal".
Female_Menopause <- subset(
  dataset,
  Sex == "Female" & Hormon == "Postmenopausal"
)
print(Female_Menopause)

# Box plot of Age for that subset.
Ques2 <- ggplot(Female_Menopause, aes(Sex, Age)) +
  geom_boxplot() +
  # Add the mode of Age as a red star.
  stat_summary(
    fun = get_mode, geom = "point", color = "red", size = 3, shape = 8
  ) +
  labs(x = "Factor", y = "Age", title = "Box Plot of Female_Menopause")
ggplotly(Ques2)
# Summary of the built-in `cars` data set.
summary(cars)
```
## Including Plots
You can also embed plots, for example:
```{r pressure, echo=FALSE}
# Plot the built-in `pressure` data set.
plot(pressure)
```
Note that the `echo = FALSE` parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.