```{r setup, include=FALSE}
# Global chunk option: echo the R source of every chunk in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
```

## R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

```{r cars}
# 3. Pre-process data
# 3.1 Read data of the CSV file
# NOTE(review): `osteoporosis` is not created anywhere in the visible document;
# it must already exist (e.g. imported from the CSV) before this chunk runs,
# otherwise knitting in a fresh session will fail here.
dataset <- osteoporosis

# Check for NA values (two ways)
any(is.na(dataset))      # TRUE if at least one cell is missing
colSums(is.na(dataset))  # number of missing values per column

# Format data: rename the variables
colnames(dataset) <- c(
  "ID", "Age", "Sex", "Hormon", "Fam", "Race", "Weight", "Ca_Intake",
  "D_Intake", "PA", "Smok", "Alcohol", "Condits", "Medics", "PreFrac", "Osteo"
)

# Check data types
str(dataset)
summary(dataset)

# Convert the categorical columns to factors
cols_to_factor <- c(
  "Sex", "Hormon", "Fam", "Race", "Weight", "Ca_Intake", "D_Intake",
  "PA", "Smok", "Alcohol", "Condits", "Medics", "PreFrac", "Osteo"
)
dataset[cols_to_factor] <- lapply(dataset[cols_to_factor], as.factor)

# ID is an identifier (character); Age is the only numeric variable
dataset$ID <- as.character(dataset$ID)
dataset$Age <- as.numeric(dataset$Age)

# Check data types again
str(dataset)
summary(dataset)

# Check for duplicated IDs.
# Combining duplicated() in both directions keeps every row that shares an ID,
# including the first occurrence.
duplicate_ids <- dataset[
  duplicated(dataset$ID) | duplicated(dataset$ID, fromLast = TRUE),
]
print(duplicate_ids)

# 4. Indicated Statistics
# 4.1. Summary data
# Visualize Age by box plot
# 4.1.1. Create box plot for Age

# Install the plotting packages only when they are missing, so the document
# does not reinstall them on every run.
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
if (!requireNamespace("plotly", quietly = TRUE)) install.packages("plotly")
library(ggplot2)
library(plotly)

# Use a dummy (empty) x axis so all ages fall into a single box
d <- ggplot(dataset, aes(x = "", y = Age)) +
  geom_boxplot(fill = "lightblue") +
  # mark the mean as a red point on top of the box
  stat_summary(fun = mean, geom = "point", shape = 20, size = 3, color = "red") +
  labs(title = "Box Plot of Age with Summary Statistics", y = "Age", x = "")

# Convert the ggplot into an interactive plot
ggplotly(d)

# One box plot of Age for each level of a variable (Sex)
library(ggplot2)

# Draw box plot
q <- ggplot(dataset, aes(x = as.factor(Sex), y = Age)) +
  geom_boxplot() +
  labs(x = "Factor", y = "Age", title = "Box Plot of Age by Sex") +
  theme_minimal()
ggplotly(q)

# Draw Q-Q plot for Age
qqnorm(dataset$Age, main = "Q-Q Plot of Age")
qqline(dataset$Age, col = "red", lwd = 2)  # reference line for normality

# 4.1.2. Multiple box plots
# Install (only if missing) and load the packages
if (!requireNamespace("tidyverse", quietly = TRUE)) install.packages("tidyverse")
library(tidyverse)
library(plotly)
library(ggplot2)

# Transform to "long" type: one row per (observation, variable) pair.
# Values are converted to character because the source factors have different
# level sets and must share a single `Value` column.
long_data1 <- dataset %>%
  pivot_longer(
    cols = c(Sex, Hormon, Fam, Race, Weight, Ca_Intake, D_Intake),
    names_to = "Variable",
    values_to = "Value",
    values_transform = list(Value = as.character)
  ) %>%
  select(Age, Variable, Value)

# Box plots combined with facet_wrap to draw one panel per variable
p1 <- ggplot(long_data1, aes(x = Value, y = Age)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot of Factors by Age - Part 1", x = "Factors", y = "Age") +
  facet_wrap(~ Variable, scales = "free_x") + # separate panel (own x axis) per variable
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # rotate x tick labels

# Convert to an interactive plot
ggplotly(p1)

# Second half of the factor variables, reshaped to "long" type.
# Values are converted to character because the source factors have different
# level sets and must share a single `Value` column.
long_data2 <- dataset %>%
  pivot_longer(
    cols = c(PA, Smok, Alcohol, Condits, Medics, PreFrac, Osteo),
    names_to = "Variable",
    values_to = "Value",
    values_transform = list(Value = as.character)
  ) %>%
  select(Age, Variable, Value)

# Box plots combined with facet_wrap to draw one panel per variable
p2 <- ggplot(long_data2, aes(x = Value, y = Age)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot of Factors by Age - Part 2", x = "Factors", y = "Age") +
  facet_wrap(~ Variable, scales = "free_x") + # separate panel (own x axis) per variable
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # rotate x tick labels

# Convert to an interactive plot with ggplotly
ggplotly(p2)

# 4.1.3. Describe the target variable: Osteo
# Pie chart
osteo_counts <- table(dataset$Osteo)
# Level "0" is labelled "No", every other level "Yes"; the count is appended
labels <- ifelse(names(osteo_counts) == "0", "No", "Yes")
labels <- paste(labels, osteo_counts, sep = ":")
# NOTE(review): the title says "Percentage" (and misspells "Osteoporosis"),
# but the labels show raw counts — confirm the intended wording.
pie(
  osteo_counts,
  main = "Percentage of Osteoperosis people",
  col = rainbow(length(osteo_counts)),
  labels = labels
)

# Draw multiple box plots for the demographic variables: Age, Sex, Race
q <- ggplot(dataset, aes(Sex, Age)) +
  geom_boxplot(aes(fill = Osteo)) +
  labs(x = "Factor", y = "Age", title = "Box Plot of Factor by Age") +
  facet_grid(Race ~ Osteo, margins = TRUE) # margins adds the "(all)" row/column
ggplotly(q)

# 4.2. Normal distribution test for age
# 4.2.1. Statistical test - Shapiro-Wilk
shapiro_test_result <- shapiro.test(dataset$Age)
print(shapiro_test_result)

# For `Osteo` == 1
# NOTE(review): the subset keeps rows with Osteo == "1", yet the variable is
# named `no_osteoporosis_data` — confirm which group is intended.
no_osteoporosis_data <- subset(dataset, Osteo == "1")
shapiro_test_result <- shapiro.test(no_osteoporosis_data$Age)
print(shapiro_test_result)

# Histogram of Age within the Osteo == "1" subset.
# The subset is passed as the data and Age is mapped to x; mapping a whole
# data frame to an aesthetic is not valid.
ggplot(no_osteoporosis_data, aes(x = Age)) +
  geom_histogram(bins = 30)

# 4.2.2. Graphic method
# Histogram with a normal distribution curve
library(ggplot2)
ggplot(dataset, aes(x = Age)) +
  # plot densities (not counts) so the histogram is on the same scale as dnorm
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, fill = "skyblue", color = "black"
  ) +
  # normal curve using the sample mean and standard deviation of Age
  stat_function(
    fun = dnorm,
    args = list(mean = mean(dataset$Age), sd = sd(dataset$Age)),
    color = "red", size = 1
  ) +
  labs(
    title = "Histogram of Age with Normal Distribution Curve",
    x = "Age", y = "Density"
  )

# 5. Prediction Statistics
# 5.1. Correlation test
# 5.1.1. Chi-square test for the 13 factor-type variables against Osteo
variables <- c(
  "Sex", "Hormon", "Fam", "Race", "Weight", "Ca_Intake", "D_Intake",
  "PA", "Smok", "Alcohol", "Condits", "Medics", "PreFrac"
)
for (var in variables) {
  cat("Chi-square for:", var, "\n")
  # contingency table of the current variable versus the target
  test_result <- chisq.test(table(dataset[[var]], dataset$Osteo))
  print(test_result)
  cat("\n") # blank line between consecutive tests
}

# 5.2. t-test for the numeric variable: Age (compared across Osteo groups)
t_test_result <- t.test(Age ~ Osteo, data = dataset)
print(t_test_result)

# Double-check using the Mann-Whitney (Wilcoxon rank-sum) test
mann_whitney_result <- wilcox.test(Age ~ Osteo, data = dataset)
print(mann_whitney_result)

# 5.3. Interactions using logistic regression
# One formula per group of variables; `*` includes main effects and interactions
formulas <- list(
  Demographic = Osteo ~ Age * Sex * Hormon,
  Genetic = Osteo ~ Fam * Race,
  Nutrient = Osteo ~ Weight * Ca_Intake * D_Intake,
  Lifestyle = Osteo ~ PA * Alcohol * Smok,
  Medical = Osteo ~ Condits * Medics * PreFrac
)

# Loop over the groups, fit a binomial glm for each and keep its summary
results <- list()
for (name in names(formulas)) {
  model <- glm(formulas[[name]], data = dataset, family = binomial)
  results[[name]] <- summary(model)
  cat("## Summary for", name, "factors:")
  print(results[[name]])
}

# 5.*. Bonus: Estimate
# Other group: weight, conditions, medics
# NOTE(review): the multiplication operators between the predictors were lost
# in the source text; a full interaction (Age * Weight * PA * PreFrac) is
# assumed here — confirm against the original analysis.
model_interaction <- glm(
  Osteo ~ Age * Weight * PA * PreFrac,
  data = dataset,
  family = binomial
)
summary(model_interaction)

# 6. Discussion

# Ques 1: How can men be recorded as Postmenopausal? (1/4 of the total)

#' Most frequent value (mode) of a vector
#'
#' @param v A vector.
#' @return The value of `v` that occurs most often. When several values are
#'   tied, the one that appears first in `v` is returned.
get_mode <- function(v) {
  uniqv <- unique(v)
  # match() maps each element to its index in uniqv; tabulate() counts them
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

# Rows where Sex is "Male" but Hormon is "Postmenopausal"
Male_Menopause <- subset(dataset, Sex == "Male" & Hormon == "Postmenopausal")
print(Male_Menopause)

# Box plot of Age for this subset, with the modal age marked as a red star
Ques1 <- ggplot(Male_Menopause, aes(Sex, Age)) +
  geom_boxplot() +
  stat_summary(fun = get_mode, geom = "point", color = "red", size = 3, shape = 8) + # add the mode
  labs(x = "Factor", y = "Age", title = "Box Plot of Male_Menopause")
ggplotly(Ques1)

# Ques 2: Why are some women recorded as Postmenopausal extremely early? (18)
Female_Menopause <- subset(dataset, Sex == "Female" & Hormon == "Postmenopausal")
print(Female_Menopause)

# Box plot of Age for this subset, with the modal age marked as a red star
Ques2 <- ggplot(Female_Menopause, aes(Sex, Age)) +
  geom_boxplot() +
  stat_summary(fun = get_mode, geom = "point", color = "red", size = 3, shape = 8) + # add the mode
  labs(x = "Factor", y = "Age", title = "Box Plot of Female_Menopause")
ggplotly(Ques2)

# Summary of the built-in `cars` data set (from the R Markdown template)
summary(cars)
```


## Including Plots

You can also embed plots, for example:

```{r pressure, echo=FALSE}
# Plot the built-in `pressure` data set; echo=FALSE hides this code in the output
plot(pressure)
```

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.