ExamAnswer

Author

IyadAlkronz

Question 1

load the dataset

data(PimaIndiansDiabetes)

1. fill the following table

The data set is consist of 768 observations and 9 variables. Data Description for the 9 variables are as follows.

  1. pregnant - Number of times pregnant
  2. glucose - Plasma glucose concentration (glucose tolerance test)
  3. pressure - Diastolic blood pressure (mm Hg)
  4. triceps - Triceps skin fold thickness (mm)
  5. insulin - 2-Hour serum insulin (mu U/ml)
  6. mass - Body mass index (weight in kg/(height in m)^2)
  7. pedigree - Diabetes pedigree function
  8. age - Age (years)
  9. diabetes - Class variable (test for diabetes)

2. basic summary statistics in numeric variables

numeric_vars <- PimaIndiansDiabetes[, sapply(PimaIndiansDiabetes, is.numeric)]
summary(numeric_vars)
    pregnant         glucose         pressure         triceps     
 Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
 1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
 Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
 Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
 3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
 Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
    insulin           mass          pedigree           age       
 Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
 1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00  
 Median : 30.5   Median :32.00   Median :0.3725   Median :29.00  
 Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24  
 3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
 Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00  
PimaIndiansDiabetes %>% group_by(diabetes) %>% summarize(total = n())
# A tibble: 2 × 2
  diabetes total
  <fct>    <int>
1 neg        500
2 pos        268

Comments The dataset shows that there were 500 cases classified as “negative” and 268 cases classified as “positive” for diabetes.

range in all variables

PimaIndiansDiabetes_factors <- data.frame(lapply(PimaIndiansDiabetes, as.character), stringsAsFactors = FALSE)
sapply(PimaIndiansDiabetes_factors, range)
     pregnant glucose pressure triceps insulin mass   pedigree age  diabetes
[1,] "0"      "0"     "0"      "0"     "0"     "0"    "0.078"  "21" "neg"   
[2,] "9"      "99"    "98"     "99"    "99"    "67.1" "2.42"   "81" "pos"   

missing values in each column

missings <- colSums(is.na(PimaIndiansDiabetes))
print(missings)
pregnant  glucose pressure  triceps  insulin     mass pedigree      age 
       0        0        0        0        0        0        0        0 
diabetes 
       0 

Comments There are no missing values present in the data.

3. Body Mass Index

a

df <- PimaIndiansDiabetes %>% mutate(BMIcat = case_when(
  mass <18.5 ~"Underweight",
  mass >=18.5 & mass <= 24.9 ~"Normal",
  mass >24.9 & mass <= 29.9 ~"OverWeight",
 mass >=30 ~"Obesity",
  TRUE ~ ""
))

df <- df %>% mutate(BMIcat = factor(BMIcat, 
                   ordered = T, 
                   levels = c(
                     "Underweight",
                     "Normal",
                              "OverWeight", 
                              "Obesity" )) ) 
df %>% head() %>% knitr::kable()
pregnant glucose pressure triceps insulin mass pedigree age diabetes BMIcat
6 148 72 35 0 33.6 0.627 50 pos Obesity
1 85 66 29 0 26.6 0.351 31 neg OverWeight
8 183 64 0 0 23.3 0.672 32 pos Normal
1 89 66 23 94 28.1 0.167 21 neg OverWeight
0 137 40 35 168 43.1 2.288 33 pos Obesity
5 116 74 0 0 25.6 0.201 30 neg OverWeight

b frequency and distribution

BMIcat_frequency <- df %>% group_by(BMIcat) %>% summarize(count = n()) 
BMIcat_frequency %>% knitr::kable()
BMIcat count
Underweight 15
Normal 102
OverWeight 179
Obesity 472
BMIcat_frequency %>% ggplot(aes(x = BMIcat , y =  count)) + geom_col(aes(fill = BMIcat)) + 
  labs(title = "BMIcat Distributions")

Comments

The dataset shows that there were 15 individuals classified as “Underweight,” 102 individuals classified as “Normal,” 179 individuals classified as “Overweight,” and 472 individuals classified as “Obesity.”

As the BMI increases, the frequency also increases.

4. invistigate relation between first 8 variabels

Correlation Matrix

cor_data <- cor(df[,1:8])
#Numerical Correlation Matrix
cor_data
            pregnant    glucose   pressure     triceps     insulin       mass
pregnant  1.00000000 0.12945867 0.14128198 -0.08167177 -0.07353461 0.01768309
glucose   0.12945867 1.00000000 0.15258959  0.05732789  0.33135711 0.22107107
pressure  0.14128198 0.15258959 1.00000000  0.20737054  0.08893338 0.28180529
triceps  -0.08167177 0.05732789 0.20737054  1.00000000  0.43678257 0.39257320
insulin  -0.07353461 0.33135711 0.08893338  0.43678257  1.00000000 0.19785906
mass      0.01768309 0.22107107 0.28180529  0.39257320  0.19785906 1.00000000
pedigree -0.03352267 0.13733730 0.04126495  0.18392757  0.18507093 0.14064695
age       0.54434123 0.26351432 0.23952795 -0.11397026 -0.04216295 0.03624187
            pedigree         age
pregnant -0.03352267  0.54434123
glucose   0.13733730  0.26351432
pressure  0.04126495  0.23952795
triceps   0.18392757 -0.11397026
insulin   0.18507093 -0.04216295
mass      0.14064695  0.03624187
pedigree  1.00000000  0.03356131
age       0.03356131  1.00000000

Correlation matrix plots

corrplot::corrplot(cor_data)

corrplot::corrplot(cor_data, type = "lower", method = "number")

corrplot::corrplot(cor_data, type = "lower", method = "pie")

We can see that there is a moderately positive high correlation between age and pregnant.

5. Outliers in [ pressure , age]

box_plot <- function(bivar_name, bivar, data, output_var) {
  
  g_1 <- ggplot(data = data, aes(y = bivar, fill = output_var)) +
          geom_boxplot() +
          theme_bw() +
          labs( title = paste(bivar_name,"Outlier Detection", sep =" "), y = bivar_name) +
          theme(plot.title = element_text(hjust = 0.5))
   
  plot(g_1)
}

 
box_plot(bivar_name = "age", bivar = df[,"age"], data = df, output_var = df[,'diabetes'])

box_plot(bivar_name = "pressure", bivar = df[,"pressure"], data = df, output_var = df[,'diabetes'])

We notice that there are outliers present in the pressure and age variables.

How to treat outliers

  1. Removing
  2. Winsorization
  3. transformation
  4. Imputation

6. How to Capture Trend in glucose , pressure , triceps

Through the utilization of bivariate analysis

glucose & pressure

df %>% ggplot(aes(x = glucose , y  = pressure)) + geom_point()

df %>% ggplot(aes(x = glucose , y  = triceps)) + geom_point()

df %>% ggplot(aes(x = pressure , y  = triceps)) + geom_point()

7. Check Notmality

glucose

# Generate a Q-Q plot
qqnorm(df$glucose)
qqline(df$glucose, col = "red")

shapiro.test(df$glucose)

    Shapiro-Wilk normality test

data:  df$glucose
W = 0.9701, p-value = 1.986e-11

when \(\alpha = 0.05\) Therefore, with a p-value of 1.986e-11 less that \(\alpha\), we have strong evidence to suggest that the data does not follow normal distribution.

8. heat representation of data

library(gplots)
Registered S3 method overwritten by 'gplots':
  method         from     
  reorder.factor DescTools

Attaching package: 'gplots'
The following object is masked from 'package:stats':

    lowess
library(RColorBrewer)

features <- PimaIndiansDiabetes[, 1:8]

features <- scale(features)

my_palette <- colorRampPalette(rev(brewer.pal(9, "Blues")))

# Create the heat map
heatmap(features, scale = "column")

9. shiny App

data(PimaIndiansDiabetes)

df <- PimaIndiansDiabetes

bmicat_list <- c("Underweight"  , "Normal" , "OverWeight" , "Obesity")
diabetes_list <- c( "neg"  , "pos")

ui <- navbarPage(
  "My Shiny App",
  sidebarLayout(
    sidebarPanel(
      selectInput("diabetes_input", "Diabetes", choices = diabetes_list),
      selectInput("bmicat_input", "BMI category", choices = bmicat_list),
      verbatimTextOutput("rowcount"),
      verbatimTextOutput("variablesCounts")
    ),
    mainPanel(
      DTOutput("summary_table"),
      
      plotOutput("distribution"),
      plotOutput("scatter_plots")
      
      
      
    )
  )
  
)


server <- function(input, output) {
  
  output$distribution <- renderPlot({
    p1 <- ggplot(data =filtered_data() ,  aes(x = glucose)) + geom_histogram() + labs(title = "Glucose Distribution")
    p2 <- ggplot(data =filtered_data() ,  aes(x = pressure)) + geom_histogram() + labs(title = "Pressure Distribution")
    p3 <- ggplot(data =filtered_data() ,  aes(x = triceps)) + geom_histogram() + labs(title = "Triceps Distribution")
    gridExtra::grid.arrange(p1, p2, p3 ,ncol=3)
    
    
  })
  
  
  
  output$scatter_plots <- renderPlot({
    
    p1 <- df %>% ggplot(aes(x = glucose , y  = pressure)) + geom_point()+ labs(title = "Relation Between" , 
                                                                               subtitle   = "glucose Vs pressure "
    )
    p2 <- df %>% ggplot(aes(x = glucose , y  = triceps)) + geom_point()+ labs(title = "Relation Between" , 
                                                                              subtitle = "glucose Vs triceps "
    )
    p3 <- df %>% ggplot(aes(x = pressure , y  = triceps)) + geom_point()+ labs(title = "Relation Between" , 
                                                                               subtitle = "pressure Vs triceps "
    )
    gridExtra::grid.arrange(p1, p2, p3 ,ncol=3)
    
    
  })
  
  
  
  diabetes_input <- reactive({
    print(input$diabetes_input)
    get(input$diabetes_input)
    
  })
  
  bmicat_input <- reactive({
    get(input$bmicat_input)
  })
  
  filtered_data <- reactive({
    selected_option_diabetes <- input$diabetes_input
    selected_option_bmicat_input <- input$bmicat_input
    df %>% filter(diabetes == selected_option_diabetes) 
    # %>% filter(BMIcat == selected_option_bmicat_input)
  })
  
  
  
  output$summary_table <- renderDT({
    filterd_data  <-  filtered_data()[1:8]
    
    summary_data <- sapply(filterd_data, summary)
    x <- as.data.frame(summary_data )
    x
    
  })
  
  
}





shinyApp(ui, server)

Listening on http://127.0.0.1:7862
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Question 2

# Subset necessary columns
df_subset <- gapminder[, c("country", "pop")]
df_subset <- df_subset %>% group_by(country) %>% summarize(mean_pop = mean(pop) / 10^6)
# Load world map dataset
world_map <- map_data("world")

# Merge data with world map dataset
merged_data <- merge(world_map, df_subset, by.x = "region", by.y = "country", all.x = TRUE)

 
# Choose a color palette
 palette <-  colorRampPalette(c("red","orange","blue") )
merged_data   <- merged_data %>% filter(  region != "Russia" )
# Create the plot
ggplot() +
  geom_polygon(data = merged_data, aes(x = long, y = lat, group = group, fill = mean_pop) ) +
  scale_fill_gradientn(colors = palette(100), name = "Population") +
  coord_map() +theme_void()