Week 3 | Data Dive — Probabilities and Anomalies

 # Load the dplyr library
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.3     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)
library(ggplot2)

# Read the semicolon-delimited data file into `mpg`; the column-type summary
# that readr would normally print is silenced with show_col_types = FALSE.
# NOTE(review): `mpg` is also the name of an example data set shipped with
# ggplot2 (loaded above); a more descriptive name would avoid confusion, but
# the rest of this script refers to `mpg`, so it is kept here.
mpg <- read_delim(
  file = "C:/Users/kondo/OneDrive/Desktop/INTRO to Statistics and R/Data Set and work/data.csv",
  delim = ";",
  show_col_types = FALSE
)

# Per-`Target` summary: the mean of `Curricular units 1st sem (credited)` and
# the number of rows in each group. The "expected probability" of a group is
# defined here as 1 / (group size); a group is labelled "Normal" only when
# that value is above 0.0005, otherwise "Anomaly".
# NOTE(review): because the value is 1 / count, it is the *largest* groups
# (2000 rows or more) that end up labelled "Anomaly" -- confirm this is the
# intended reading.
grouped_data_1 <- mpg %>%
  group_by(Target) %>%
  summarise(
    Curricular_units_1st_sem_credited_avg_value =
      mean(`Curricular units 1st sem (credited)`),
    count_Curricular_units_1st_sem_credited = n(),
    .groups = "drop"
  ) %>%
  mutate(
    expected_prob_Curricular_units_1sem_credited =
      1 / count_Curricular_units_1st_sem_credited,
    `Curricular units 1st sem (credited) anomaly` = ifelse(
      expected_prob_Curricular_units_1sem_credited <= 0.0005,
      "Anomaly",
      "Normal"
    )
  )

print(grouped_data_1)

## # A tibble: 3 × 5
##   Target   Curricular_units_1st_…¹ count_Curricular_uni…² expected_prob_Curric…³
##   <chr>                      <dbl>                  <int>                  <dbl>
## 1 Dropout                    0.609                   1421               0.000704
## 2 Enrolled                   0.508                    794               0.00126 
## 3 Graduate                   0.847                   2209               0.000453
## # ℹ abbreviated names: ¹Curricular_units_1st_sem_credited_avg_value,
## #   ²count_Curricular_units_1st_sem_credited,
## #   ³expected_prob_Curricular_units_1sem_credited
## # ℹ 1 more variable: `Curricular units 1st sem (credited) anomaly` <chr>

# Bin two numeric columns into ranges -- `Course` and
# `Previous qualification (grade)` -- then summarise `Unemployment rate` for
# every observed combination of bins: its mean and the number of rows.
# The "expected probability" is 1 / (rows in the combination); a combination
# is labelled "Normal" only when that value is above 0.05, otherwise "Anomaly".
# NOTE(review): with 1 / count, combinations holding 20 rows or more are the
# ones labelled "Anomaly" -- confirm this is the intended reading.
grouped_data_2 <- mpg %>%
  mutate(
    Course = cut(Course, breaks = c(0, 33, 171, 8014, 9119, 9773, 9991)),
    `Previous qualification (grade)` = cut(
      `Previous qualification (grade)`,
      breaks = c(0, 50, 100, 150, 200)
    )
  ) %>%
  group_by(Course, `Previous qualification (grade)`) %>%
  summarise(
    Unemployment_rate_avg_value = mean(`Unemployment rate`),
    count_unemployment_rate = n(),
    .groups = "drop"
  ) %>%
  mutate(
    expected_prob_Unemployement_rate = 1 / count_unemployment_rate,
    `Unemployment rate anomaly` = ifelse(
      expected_prob_Unemployement_rate <= 0.05,
      "Anomaly",
      "Normal"
    )
  )

print(grouped_data_2)

## # A tibble: 17 × 6
##    Course   Previous qualificati…¹ Unemployment_rate_av…² count_unemployment_r…³
##    <fct>    <fct>                                   <dbl>                  <int>
##  1 (0,33]   (50,100]                                16.2                       1
##  2 (0,33]   (100,150]                               11.9                      11
##  3 (33,171] (50,100]                                10.8                       1
##  4 (33,171] (100,150]                               11.4                     181
##  5 (33,171] (150,200]                               11.5                      33
##  6 (171,8.… (50,100]                                 9.52                     19
##  7 (171,8.… (100,150]                               11.1                     183
##  8 (171,8.… (150,200]                               11.8                      13
##  9 (8.01e+… (50,100]                                12.8                       8
## 10 (8.01e+… (100,150]                               11.9                     855
## 11 (8.01e+… (150,200]                               12.1                      80
## 12 (9.12e+… (50,100]                                11.2                      42
## 13 (9.12e+… (100,150]                               11.6                    2370
## 14 (9.12e+… (150,200]                               11.7                     167
## 15 (9.77e+… (50,100]                                11.6                      11
## 16 (9.77e+… (100,150]                               11.1                     420
## 17 (9.77e+… (150,200]                               10.7                      29
## # ℹ abbreviated names: ¹`Previous qualification (grade)`,
## #   ²Unemployment_rate_avg_value, ³count_unemployment_rate
## # ℹ 2 more variables: expected_prob_Unemployement_rate <dbl>,
## #   `Unemployment rate anomaly` <chr>

# Group by the categorical `Target` column together with a binned version of
# `Previous qualification (grade)`, then summarise `GDP` for every observed
# combination: its mean and the number of rows.
# The "expected probability" is 1 / (rows in the combination); a combination
# is labelled "Normal" only when that value is above 0.0005, otherwise
# "Anomaly".
# NOTE(review): with 1 / count, only combinations holding 2000 rows or more
# can be labelled "Anomaly" -- confirm this is the intended reading.
grouped_data_3 <- mpg %>%
  mutate(
    `Previous qualification (grade)` = cut(
      `Previous qualification (grade)`,
      breaks = c(0, 50, 100, 150, 200)
    )
  ) %>%
  group_by(Target, `Previous qualification (grade)`) %>%
  summarise(
    GDP_avg_value = mean(GDP),
    GDP_count = n(),
    .groups = "drop"
  ) %>%
  mutate(
    expected_prob_GDP = 1 / GDP_count,
    GDP_anomaly = ifelse(expected_prob_GDP <= 0.0005, "Anomaly", "Normal")
  )

print(grouped_data_3)

## # A tibble: 9 × 6
##   Target   Previous qualification (g…¹ GDP_avg_value GDP_count expected_prob_GDP
##   <chr>    <fct>                               <dbl>     <int>             <dbl>
## 1 Dropout  (50,100]                          -0.442         31          0.0323  
## 2 Dropout  (100,150]                         -0.129       1313          0.000762
## 3 Dropout  (150,200]                         -0.399         77          0.0130  
## 4 Enrolled (50,100]                          -0.546         16          0.0625  
## 5 Enrolled (100,150]                          0.0902       734          0.00136 
## 6 Enrolled (150,200]                         -0.345         44          0.0227  
## 7 Graduate (50,100]                          -0.454         35          0.0286  
## 8 Graduate (100,150]                          0.117       1973          0.000507
## 9 Graduate (150,200]                         -0.170        201          0.00498 
## # ℹ abbreviated name: ¹`Previous qualification (grade)`
## # ℹ 1 more variable: GDP_anomaly <chr>

# Bar charts of the per-group "expected probability" (1 / group size) computed
# in the three summaries above.

# grouped_data_1 has exactly one row per Target, so Target alone identifies
# each bar.
barplot(
  grouped_data_1$expected_prob_Curricular_units_1sem_credited,
  names.arg = grouped_data_1$Target,
  main = "Expected Probability by Target",
  ylab = "Expected Probability of Units credited in 1st sem"
)

# grouped_data_2 and grouped_data_3 have one row per *pair* of grouping
# values. Labelling their bars with only the first grouping column gives
# several bars the same label, so the bars cannot be told apart. Build each
# label from both grouping columns instead.
labels_2 <- paste(
  grouped_data_2$Course,
  grouped_data_2$`Previous qualification (grade)`,
  sep = " / "
)
labels_3 <- paste(
  grouped_data_3$Target,
  grouped_data_3$`Previous qualification (grade)`,
  sep = " / "
)

# The combined labels are long: widen the bottom margin, and draw the labels
# perpendicular to the axis (las = 2) in a smaller font (cex.names). The
# previous graphics settings are restored once the plots are drawn.
old_par <- par(mar = c(14, 4, 4, 2) + 0.1)

barplot(
  grouped_data_2$expected_prob_Unemployement_rate,
  names.arg = labels_2,
  las = 2,
  cex.names = 0.6,
  main = "Expected Probability by Course and Previous qualification (grade)",
  ylab = "Expected Probability of unemployement rate"
)

barplot(
  grouped_data_3$expected_prob_GDP,
  names.arg = labels_3,
  las = 2,
  cex.names = 0.7,
  main = "Expected Probability by Target and Previous qualification (grade)",
  ylab = "Expected Probability of GDP"
)

par(old_par)

Week 3 | Data Dive — Probabilities and Anomalies

Vaishali Kondoju

2023-09-11