data(PimaIndiansDiabetes)ExamAnswer
Question 1
load the dataset
1. fill the following table
The data set is consist of 768 observations and 9 variables. Data Description for the 9 variables are as follows.
- pregnant - Number of times pregnant
- glucose - Plasma glucose concentration (glucose tolerance test)
- pressure - Diastolic blood pressure (mm Hg)
- triceps - Triceps skin fold thickness (mm)
- insulin - 2-Hour serum insulin (mu U/ml)
- mass - Body mass index (weight in kg/(height in m)^2)
- pedigree - Diabetes pedigree function
- age - Age (years)
- diabetes - Class variable (test for diabetes)
2. basic summary statistics in numeric variables
numeric_vars <- PimaIndiansDiabetes[, sapply(PimaIndiansDiabetes, is.numeric)]
summary(numeric_vars) pregnant glucose pressure triceps
Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
insulin mass pedigree age
Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
PimaIndiansDiabetes %>% group_by(diabetes) %>% summarize(total = n())# A tibble: 2 × 2
diabetes total
<fct> <int>
1 neg 500
2 pos 268
Comments The dataset shows that there were 500 cases classified as “negative” and 268 cases classified as “positive” for diabetes.
range in all variables
PimaIndiansDiabetes_factors <- data.frame(lapply(PimaIndiansDiabetes, as.character), stringsAsFactors = FALSE)
sapply(PimaIndiansDiabetes_factors, range) pregnant glucose pressure triceps insulin mass pedigree age diabetes
[1,] "0" "0" "0" "0" "0" "0" "0.078" "21" "neg"
[2,] "9" "99" "98" "99" "99" "67.1" "2.42" "81" "pos"
missing values in each column
missings <- colSums(is.na(PimaIndiansDiabetes))
print(missings)pregnant glucose pressure triceps insulin mass pedigree age
0 0 0 0 0 0 0 0
diabetes
0
Comments There are no missing values present in the data.
3. Body Mass Index
a
df <- PimaIndiansDiabetes %>% mutate(BMIcat = case_when(
mass <18.5 ~"Underweight",
mass >=18.5 & mass <= 24.9 ~"Normal",
mass >24.9 & mass <= 29.9 ~"OverWeight",
mass >=30 ~"Obesity",
TRUE ~ ""
))
df <- df %>% mutate(BMIcat = factor(BMIcat,
ordered = T,
levels = c(
"Underweight",
"Normal",
"OverWeight",
"Obesity" )) )
df %>% head() %>% knitr::kable()| pregnant | glucose | pressure | triceps | insulin | mass | pedigree | age | diabetes | BMIcat |
|---|---|---|---|---|---|---|---|---|---|
| 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | pos | Obesity |
| 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | neg | OverWeight |
| 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | pos | Normal |
| 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | neg | OverWeight |
| 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | pos | Obesity |
| 5 | 116 | 74 | 0 | 0 | 25.6 | 0.201 | 30 | neg | OverWeight |
b frequency and distribution
BMIcat_frequency <- df %>% group_by(BMIcat) %>% summarize(count = n())
BMIcat_frequency %>% knitr::kable()| BMIcat | count |
|---|---|
| Underweight | 15 |
| Normal | 102 |
| OverWeight | 179 |
| Obesity | 472 |
BMIcat_frequency %>% ggplot(aes(x = BMIcat , y = count)) + geom_col(aes(fill = BMIcat)) +
labs(title = "BMIcat Distributions")Comments
The dataset shows that there were 15 individuals classified as “Underweight,” 102 individuals classified as “Normal,” 179 individuals classified as “Overweight,” and 472 individuals classified as “Obesity.”
As the BMI increases, the frequency also increases.
4. invistigate relation between first 8 variabels
Correlation Matrix
cor_data <- cor(df[,1:8])
#Numerical Correlation Matrix
cor_data pregnant glucose pressure triceps insulin mass
pregnant 1.00000000 0.12945867 0.14128198 -0.08167177 -0.07353461 0.01768309
glucose 0.12945867 1.00000000 0.15258959 0.05732789 0.33135711 0.22107107
pressure 0.14128198 0.15258959 1.00000000 0.20737054 0.08893338 0.28180529
triceps -0.08167177 0.05732789 0.20737054 1.00000000 0.43678257 0.39257320
insulin -0.07353461 0.33135711 0.08893338 0.43678257 1.00000000 0.19785906
mass 0.01768309 0.22107107 0.28180529 0.39257320 0.19785906 1.00000000
pedigree -0.03352267 0.13733730 0.04126495 0.18392757 0.18507093 0.14064695
age 0.54434123 0.26351432 0.23952795 -0.11397026 -0.04216295 0.03624187
pedigree age
pregnant -0.03352267 0.54434123
glucose 0.13733730 0.26351432
pressure 0.04126495 0.23952795
triceps 0.18392757 -0.11397026
insulin 0.18507093 -0.04216295
mass 0.14064695 0.03624187
pedigree 1.00000000 0.03356131
age 0.03356131 1.00000000
Correlation matrix plots
corrplot::corrplot(cor_data)corrplot::corrplot(cor_data, type = "lower", method = "number")corrplot::corrplot(cor_data, type = "lower", method = "pie")We can see that there is a moderately positive high correlation between age and pregnant.
5. Outliers in [ pressure , age]
box_plot <- function(bivar_name, bivar, data, output_var) {
g_1 <- ggplot(data = data, aes(y = bivar, fill = output_var)) +
geom_boxplot() +
theme_bw() +
labs( title = paste(bivar_name,"Outlier Detection", sep =" "), y = bivar_name) +
theme(plot.title = element_text(hjust = 0.5))
plot(g_1)
}
box_plot(bivar_name = "age", bivar = df[,"age"], data = df, output_var = df[,'diabetes'])box_plot(bivar_name = "pressure", bivar = df[,"pressure"], data = df, output_var = df[,'diabetes'])We notice that there are outliers present in the pressure and age variables.
How to treat outliers
- Removing
- Winsorization
- transformation
- Imputation
6. How to Capture Trend in glucose , pressure , triceps
Through the utilization of bivariate analysis
glucose & pressure
df %>% ggplot(aes(x = glucose , y = pressure)) + geom_point()df %>% ggplot(aes(x = glucose , y = triceps)) + geom_point()df %>% ggplot(aes(x = pressure , y = triceps)) + geom_point()7. Check Notmality
glucose
# Generate a Q-Q plot
qqnorm(df$glucose)
qqline(df$glucose, col = "red")shapiro.test(df$glucose)
Shapiro-Wilk normality test
data: df$glucose
W = 0.9701, p-value = 1.986e-11
when \(\alpha = 0.05\) Therefore, with a p-value of 1.986e-11 less that \(\alpha\), we have strong evidence to suggest that the data does not follow normal distribution.
8. heat representation of data
library(gplots)Registered S3 method overwritten by 'gplots':
method from
reorder.factor DescTools
Attaching package: 'gplots'
The following object is masked from 'package:stats':
lowess
library(RColorBrewer)
features <- PimaIndiansDiabetes[, 1:8]
features <- scale(features)
my_palette <- colorRampPalette(rev(brewer.pal(9, "Blues")))
# Create the heat map
heatmap(features, scale = "column")9. shiny App
data(PimaIndiansDiabetes)
df <- PimaIndiansDiabetes
bmicat_list <- c("Underweight" , "Normal" , "OverWeight" , "Obesity")
diabetes_list <- c( "neg" , "pos")
ui <- navbarPage(
"My Shiny App",
sidebarLayout(
sidebarPanel(
selectInput("diabetes_input", "Diabetes", choices = diabetes_list),
selectInput("bmicat_input", "BMI category", choices = bmicat_list),
verbatimTextOutput("rowcount"),
verbatimTextOutput("variablesCounts")
),
mainPanel(
DTOutput("summary_table"),
plotOutput("distribution"),
plotOutput("scatter_plots")
)
)
)
server <- function(input, output) {
output$distribution <- renderPlot({
p1 <- ggplot(data =filtered_data() , aes(x = glucose)) + geom_histogram() + labs(title = "Glucose Distribution")
p2 <- ggplot(data =filtered_data() , aes(x = pressure)) + geom_histogram() + labs(title = "Pressure Distribution")
p3 <- ggplot(data =filtered_data() , aes(x = triceps)) + geom_histogram() + labs(title = "Triceps Distribution")
gridExtra::grid.arrange(p1, p2, p3 ,ncol=3)
})
output$scatter_plots <- renderPlot({
p1 <- df %>% ggplot(aes(x = glucose , y = pressure)) + geom_point()+ labs(title = "Relation Between" ,
subtitle = "glucose Vs pressure "
)
p2 <- df %>% ggplot(aes(x = glucose , y = triceps)) + geom_point()+ labs(title = "Relation Between" ,
subtitle = "glucose Vs triceps "
)
p3 <- df %>% ggplot(aes(x = pressure , y = triceps)) + geom_point()+ labs(title = "Relation Between" ,
subtitle = "pressure Vs triceps "
)
gridExtra::grid.arrange(p1, p2, p3 ,ncol=3)
})
diabetes_input <- reactive({
print(input$diabetes_input)
get(input$diabetes_input)
})
bmicat_input <- reactive({
get(input$bmicat_input)
})
filtered_data <- reactive({
selected_option_diabetes <- input$diabetes_input
selected_option_bmicat_input <- input$bmicat_input
df %>% filter(diabetes == selected_option_diabetes)
# %>% filter(BMIcat == selected_option_bmicat_input)
})
output$summary_table <- renderDT({
filterd_data <- filtered_data()[1:8]
summary_data <- sapply(filterd_data, summary)
x <- as.data.frame(summary_data )
x
})
}
shinyApp(ui, server)
Listening on http://127.0.0.1:7862
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Question 2
# Subset necessary columns
df_subset <- gapminder[, c("country", "pop")]
df_subset <- df_subset %>% group_by(country) %>% summarize(mean_pop = mean(pop) / 10^6)
# Load world map dataset
world_map <- map_data("world")
# Merge data with world map dataset
merged_data <- merge(world_map, df_subset, by.x = "region", by.y = "country", all.x = TRUE)
# Choose a color palette
palette <- colorRampPalette(c("red","orange","blue") )
merged_data <- merged_data %>% filter( region != "Russia" )
# Create the plot
ggplot() +
geom_polygon(data = merged_data, aes(x = long, y = lat, group = group, fill = mean_pop) ) +
scale_fill_gradientn(colors = palette(100), name = "Population") +
coord_map() +theme_void()