ExamAnswer

Author

IyadAlkronz

Question 1

load the dataset

data(PimaIndiansDiabetes)

1. fill the following table

The data set is consist of 768 observations and 9 variables. Data Description for the 9 variables are as follows.

pregnant - Number of times pregnant
glucose - Plasma glucose concentration (glucose tolerance test)
pressure - Diastolic blood pressure (mm Hg)
triceps - Triceps skin fold thickness (mm)
insulin - 2-Hour serum insulin (mu U/ml)
mass - Body mass index (weight in kg/(height in m)^2)
pedigree - Diabetes pedigree function
age - Age (years)
diabetes - Class variable (test for diabetes)

2. basic summary statistics in numeric variables

numeric_vars <- PimaIndiansDiabetes[, sapply(PimaIndiansDiabetes, is.numeric)]
summary(numeric_vars)

    pregnant         glucose         pressure         triceps     
 Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
 1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
 Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
 Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
 3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
 Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
    insulin           mass          pedigree           age       
 Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
 1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00  
 Median : 30.5   Median :32.00   Median :0.3725   Median :29.00  
 Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24  
 3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
 Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00

PimaIndiansDiabetes %>% group_by(diabetes) %>% summarize(total = n())

# A tibble: 2 × 2
  diabetes total
  <fct>    <int>
1 neg        500
2 pos        268

Comments The dataset shows that there were 500 cases classified as “negative” and 268 cases classified as “positive” for diabetes.

range in all variables

PimaIndiansDiabetes_factors <- data.frame(lapply(PimaIndiansDiabetes, as.character), stringsAsFactors = FALSE)
sapply(PimaIndiansDiabetes_factors, range)

     pregnant glucose pressure triceps insulin mass   pedigree age  diabetes
[1,] "0"      "0"     "0"      "0"     "0"     "0"    "0.078"  "21" "neg"   
[2,] "9"      "99"    "98"     "99"    "99"    "67.1" "2.42"   "81" "pos"

missing values in each column

missings <- colSums(is.na(PimaIndiansDiabetes))
print(missings)

pregnant  glucose pressure  triceps  insulin     mass pedigree      age 
       0        0        0        0        0        0        0        0 
diabetes 
       0

Comments There are no missing values present in the data.

3. Body Mass Index

a

df <- PimaIndiansDiabetes %>% mutate(BMIcat = case_when(
  mass <18.5 ~"Underweight",
  mass >=18.5 & mass <= 24.9 ~"Normal",
  mass >24.9 & mass <= 29.9 ~"OverWeight",
 mass >=30 ~"Obesity",
  TRUE ~ ""
))

df <- df %>% mutate(BMIcat = factor(BMIcat, 
                   ordered = T, 
                   levels = c(
                     "Underweight",
                     "Normal",
                              "OverWeight", 
                              "Obesity" )) ) 
df %>% head() %>% knitr::kable()

pregnant	glucose	pressure	triceps	insulin	mass	pedigree	age	diabetes	BMIcat
6	148	72	35	0	33.6	0.627	50	pos	Obesity
1	85	66	29	0	26.6	0.351	31	neg	OverWeight
8	183	64	0	0	23.3	0.672	32	pos	Normal
1	89	66	23	94	28.1	0.167	21	neg	OverWeight
0	137	40	35	168	43.1	2.288	33	pos	Obesity
5	116	74	0	0	25.6	0.201	30	neg	OverWeight

b frequency and distribution

BMIcat_frequency <- df %>% group_by(BMIcat) %>% summarize(count = n()) 
BMIcat_frequency %>% knitr::kable()

BMIcat	count
Underweight	15
Normal	102
OverWeight	179
Obesity	472

BMIcat_frequency %>% ggplot(aes(x = BMIcat , y =  count)) + geom_col(aes(fill = BMIcat)) + 
  labs(title = "BMIcat Distributions")

Comments

The dataset shows that there were 15 individuals classified as “Underweight,” 102 individuals classified as “Normal,” 179 individuals classified as “Overweight,” and 472 individuals classified as “Obesity.”

As the BMI increases, the frequency also increases.

4. invistigate relation between first 8 variabels

Correlation Matrix

cor_data <- cor(df[,1:8])
#Numerical Correlation Matrix
cor_data

            pregnant    glucose   pressure     triceps     insulin       mass
pregnant  1.00000000 0.12945867 0.14128198 -0.08167177 -0.07353461 0.01768309
glucose   0.12945867 1.00000000 0.15258959  0.05732789  0.33135711 0.22107107
pressure  0.14128198 0.15258959 1.00000000  0.20737054  0.08893338 0.28180529
triceps  -0.08167177 0.05732789 0.20737054  1.00000000  0.43678257 0.39257320
insulin  -0.07353461 0.33135711 0.08893338  0.43678257  1.00000000 0.19785906
mass      0.01768309 0.22107107 0.28180529  0.39257320  0.19785906 1.00000000
pedigree -0.03352267 0.13733730 0.04126495  0.18392757  0.18507093 0.14064695
age       0.54434123 0.26351432 0.23952795 -0.11397026 -0.04216295 0.03624187
            pedigree         age
pregnant -0.03352267  0.54434123
glucose   0.13733730  0.26351432
pressure  0.04126495  0.23952795
triceps   0.18392757 -0.11397026
insulin   0.18507093 -0.04216295
mass      0.14064695  0.03624187
pedigree  1.00000000  0.03356131
age       0.03356131  1.00000000

Correlation matrix plots

corrplot::corrplot(cor_data)

corrplot::corrplot(cor_data, type = "lower", method = "number")

corrplot::corrplot(cor_data, type = "lower", method = "pie")

We can see that there is a moderately positive high correlation between age and pregnant.

5. Outliers in [ pressure , age]

box_plot <- function(bivar_name, bivar, data, output_var) {
  
  g_1 <- ggplot(data = data, aes(y = bivar, fill = output_var)) +
          geom_boxplot() +
          theme_bw() +
          labs( title = paste(bivar_name,"Outlier Detection", sep =" "), y = bivar_name) +
          theme(plot.title = element_text(hjust = 0.5))
   
  plot(g_1)
}

 
box_plot(bivar_name = "age", bivar = df[,"age"], data = df, output_var = df[,'diabetes'])

box_plot(bivar_name = "pressure", bivar = df[,"pressure"], data = df, output_var = df[,'diabetes'])

We notice that there are outliers present in the pressure and age variables.

How to treat outliers

Removing
Winsorization
transformation
Imputation

6. How to Capture Trend in glucose , pressure , triceps

Through the utilization of bivariate analysis

glucose & pressure

df %>% ggplot(aes(x = glucose , y  = pressure)) + geom_point()

df %>% ggplot(aes(x = glucose , y  = triceps)) + geom_point()

df %>% ggplot(aes(x = pressure , y  = triceps)) + geom_point()

7. Check Notmality

glucose

# Generate a Q-Q plot
qqnorm(df$glucose)
qqline(df$glucose, col = "red")

shapiro.test(df$glucose)


    Shapiro-Wilk normality test

data:  df$glucose
W = 0.9701, p-value = 1.986e-11

when \(\alpha = 0.05\) Therefore, with a p-value of 1.986e-11 less that \(\alpha\), we have strong evidence to suggest that the data does not follow normal distribution.

8. heat representation of data

library(gplots)

Registered S3 method overwritten by 'gplots':
  method         from     
  reorder.factor DescTools


Attaching package: 'gplots'

The following object is masked from 'package:stats':

    lowess

library(RColorBrewer)

features <- PimaIndiansDiabetes[, 1:8]

features <- scale(features)

my_palette <- colorRampPalette(rev(brewer.pal(9, "Blues")))

# Create the heat map
heatmap(features, scale = "column")

9. shiny App

data(PimaIndiansDiabetes)

df <- PimaIndiansDiabetes

bmicat_list <- c("Underweight"  , "Normal" , "OverWeight" , "Obesity")
diabetes_list <- c( "neg"  , "pos")

ui <- navbarPage(
  "My Shiny App",
  sidebarLayout(
    sidebarPanel(
      selectInput("diabetes_input", "Diabetes", choices = diabetes_list),
      selectInput("bmicat_input", "BMI category", choices = bmicat_list),
      verbatimTextOutput("rowcount"),
      verbatimTextOutput("variablesCounts")
    ),
    mainPanel(
      DTOutput("summary_table"),
      
      plotOutput("distribution"),
      plotOutput("scatter_plots")
      
      
      
    )
  )
  
)


server <- function(input, output) {
  
  output$distribution <- renderPlot({
    p1 <- ggplot(data =filtered_data() ,  aes(x = glucose)) + geom_histogram() + labs(title = "Glucose Distribution")
    p2 <- ggplot(data =filtered_data() ,  aes(x = pressure)) + geom_histogram() + labs(title = "Pressure Distribution")
    p3 <- ggplot(data =filtered_data() ,  aes(x = triceps)) + geom_histogram() + labs(title = "Triceps Distribution")
    gridExtra::grid.arrange(p1, p2, p3 ,ncol=3)
    
    
  })
  
  
  
  output$scatter_plots <- renderPlot({
    
    p1 <- df %>% ggplot(aes(x = glucose , y  = pressure)) + geom_point()+ labs(title = "Relation Between" , 
                                                                               subtitle   = "glucose Vs pressure "
    )
    p2 <- df %>% ggplot(aes(x = glucose , y  = triceps)) + geom_point()+ labs(title = "Relation Between" , 
                                                                              subtitle = "glucose Vs triceps "
    )
    p3 <- df %>% ggplot(aes(x = pressure , y  = triceps)) + geom_point()+ labs(title = "Relation Between" , 
                                                                               subtitle = "pressure Vs triceps "
    )
    gridExtra::grid.arrange(p1, p2, p3 ,ncol=3)
    
    
  })
  
  
  
  diabetes_input <- reactive({
    print(input$diabetes_input)
    get(input$diabetes_input)
    
  })
  
  bmicat_input <- reactive({
    get(input$bmicat_input)
  })
  
  filtered_data <- reactive({
    selected_option_diabetes <- input$diabetes_input
    selected_option_bmicat_input <- input$bmicat_input
    df %>% filter(diabetes == selected_option_diabetes) 
    # %>% filter(BMIcat == selected_option_bmicat_input)
  })
  
  
  
  output$summary_table <- renderDT({
    filterd_data  <-  filtered_data()[1:8]
    
    summary_data <- sapply(filterd_data, summary)
    x <- as.data.frame(summary_data )
    x
    
  })
  
  
}





shinyApp(ui, server)


Listening on http://127.0.0.1:7862

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Question 2

# Subset necessary columns
df_subset <- gapminder[, c("country", "pop")]
df_subset <- df_subset %>% group_by(country) %>% summarize(mean_pop = mean(pop) / 10^6)
# Load world map dataset
world_map <- map_data("world")

# Merge data with world map dataset
merged_data <- merge(world_map, df_subset, by.x = "region", by.y = "country", all.x = TRUE)

 
# Choose a color palette
 palette <-  colorRampPalette(c("red","orange","blue") )
merged_data   <- merged_data %>% filter(  region != "Russia" )
# Create the plot
ggplot() +
  geom_polygon(data = merged_data, aes(x = long, y = lat, group = group, fill = mean_pop) ) +
  scale_fill_gradientn(colors = palette(100), name = "Population") +
  coord_map() +theme_void()