# Load the dplyr library
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(ggplot2)
# Read the semicolon-delimited data file into `mpg`; the column-type message
# that readr normally prints is suppressed.
# NOTE(review): the path is absolute and machine-specific — consider a
# project-relative path so the script can run on other machines.
mpg <- read_delim(
  file = "C:/Users/kondo/OneDrive/Desktop/INTRO to Statistics and R/Data Set and work/data.csv",
  delim = ";",
  show_col_types = FALSE
)
# Summarise the data per `Target` group: the mean of
# `Curricular units 1st sem (credited)` and the number of rows in the group.
# Each group then gets 1 / count as its "expected probability" and is labelled
# "Anomaly" when that value is at or below 0.0005, otherwise "Normal".
grouped_data_1 <- mpg %>%
  group_by(Target) %>%
  summarise(
    Curricular_units_1st_sem_credited_avg_value =
      mean(`Curricular units 1st sem (credited)`),
    count_Curricular_units_1st_sem_credited = n(),
    .groups = "drop"
  ) %>%
  mutate(
    expected_prob_Curricular_units_1sem_credited =
      1 / count_Curricular_units_1st_sem_credited,
    `Curricular units 1st sem (credited) anomaly` = ifelse(
      expected_prob_Curricular_units_1sem_credited <= 0.0005,
      "Anomaly",
      "Normal"
    )
  )

print(grouped_data_1)
## # A tibble: 3 × 5
## Target Curricular_units_1st_…¹ count_Curricular_uni…² expected_prob_Curric…³
## <chr> <dbl> <int> <dbl>
## 1 Dropout 0.609 1421 0.000704
## 2 Enrolled 0.508 794 0.00126
## 3 Graduate 0.847 2209 0.000453
## # ℹ abbreviated names: ¹Curricular_units_1st_sem_credited_avg_value,
## # ²count_Curricular_units_1st_sem_credited,
## # ³expected_prob_Curricular_units_1sem_credited
## # ℹ 1 more variable: `Curricular units 1st sem (credited) anomaly` <chr>
# Bin `Course` and `Previous qualification (grade)` into ranges and summarise
# every (course range, grade range) combination: the mean `Unemployment rate`
# and the number of rows. Each combination then gets 1 / count as its
# "expected probability" and is labelled "Normal" when that value is above
# 0.05, otherwise "Anomaly".
# Values that fall outside the given breaks are binned as NA by cut().
grouped_data_2 <- mpg %>%
  group_by(
    # dig.lab = 4 keeps the four-digit break points intact in the interval
    # labels; the default of 3 digits prints them in scientific notation
    # (e.g. "(8.01e+03,9.12e+03]"), which misstates the actual breaks.
    Course = cut(
      Course,
      breaks = c(0, 33, 171, 8014, 9119, 9773, 9991),
      dig.lab = 4
    ),
    `Previous qualification (grade)` = cut(
      `Previous qualification (grade)`,
      breaks = c(0, 50, 100, 150, 200)
    )
  ) %>%
  summarise(
    Unemployment_rate_avg_value = mean(`Unemployment rate`),
    count_unemployment_rate = n(),
    .groups = "drop"
  ) %>%
  mutate(
    expected_prob_Unemployement_rate = 1 / count_unemployment_rate,
    `Unemployment rate anomaly` = ifelse(
      expected_prob_Unemployement_rate > 0.05,
      "Normal",
      "Anomaly"
    )
  )

print(grouped_data_2)
## # A tibble: 17 × 6
## Course Previous qualificati…¹ Unemployment_rate_av…² count_unemployment_r…³
## <fct> <fct> <dbl> <int>
## 1 (0,33] (50,100] 16.2 1
## 2 (0,33] (100,150] 11.9 11
## 3 (33,171] (50,100] 10.8 1
## 4 (33,171] (100,150] 11.4 181
## 5 (33,171] (150,200] 11.5 33
## 6 (171,8.… (50,100] 9.52 19
## 7 (171,8.… (100,150] 11.1 183
## 8 (171,8.… (150,200] 11.8 13
## 9 (8.01e+… (50,100] 12.8 8
## 10 (8.01e+… (100,150] 11.9 855
## 11 (8.01e+… (150,200] 12.1 80
## 12 (9.12e+… (50,100] 11.2 42
## 13 (9.12e+… (100,150] 11.6 2370
## 14 (9.12e+… (150,200] 11.7 167
## 15 (9.77e+… (50,100] 11.6 11
## 16 (9.77e+… (100,150] 11.1 420
## 17 (9.77e+… (150,200] 10.7 29
## # ℹ abbreviated names: ¹`Previous qualification (grade)`,
## # ²Unemployment_rate_avg_value, ³count_unemployment_rate
## # ℹ 2 more variables: expected_prob_Unemployement_rate <dbl>,
## # `Unemployment rate anomaly` <chr>
# Summarise the data per combination of `Target` (categorical) and a binned
# `Previous qualification (grade)` range: the mean `GDP` and the number of
# rows. Each combination then gets 1 / count as its "expected probability" and
# is labelled "Anomaly" when that value is at or below 0.0005, otherwise
# "Normal".
grouped_data_3 <- mpg %>%
  group_by(
    Target,
    `Previous qualification (grade)` = cut(
      `Previous qualification (grade)`,
      breaks = c(0, 50, 100, 150, 200)
    )
  ) %>%
  summarise(
    GDP_avg_value = mean(GDP),
    GDP_count = n(),
    .groups = "drop"
  ) %>%
  mutate(
    expected_prob_GDP = 1 / GDP_count,
    GDP_anomaly = ifelse(expected_prob_GDP <= 0.0005, "Anomaly", "Normal")
  )

print(grouped_data_3)
## # A tibble: 9 × 6
## Target Previous qualification (g…¹ GDP_avg_value GDP_count expected_prob_GDP
## <chr> <fct> <dbl> <int> <dbl>
## 1 Dropout (50,100] -0.442 31 0.0323
## 2 Dropout (100,150] -0.129 1313 0.000762
## 3 Dropout (150,200] -0.399 77 0.0130
## 4 Enrolled (50,100] -0.546 16 0.0625
## 5 Enrolled (100,150] 0.0902 734 0.00136
## 6 Enrolled (150,200] -0.345 44 0.0227
## 7 Graduate (50,100] -0.454 35 0.0286
## 8 Graduate (100,150] 0.117 1973 0.000507
## 9 Graduate (150,200] -0.170 201 0.00498
## # ℹ abbreviated name: ¹`Previous qualification (grade)`
## # ℹ 1 more variable: GDP_anomaly <chr>
# Bar chart of the per-Target expected probability from grouped_data_1,
# one bar per Target value.
with(
  grouped_data_1,
  barplot(
    height = expected_prob_Curricular_units_1sem_credited,
    names.arg = Target,
    main = "Expected Probability by Target",
    ylab = "Expected Probability of Units credited in 1st sem"
  )
)

# Bar chart of the expected probability for every row of grouped_data_2.
# Each row is one (Course range, Previous qualification grade range) pair, so
# the bar label combines both keys; labelling by Course alone puts the same
# label on several bars and makes them indistinguishable.
course_grade_labels <- paste(
  grouped_data_2$Course,
  grouped_data_2$`Previous qualification (grade)`,
  sep = " / "
)

# Widen the bottom margin so the rotated labels fit, and restore the previous
# graphics settings afterwards.
old_par <- par(mar = c(12, 4, 4, 2) + 0.1)
barplot(
  grouped_data_2$expected_prob_Unemployement_rate,
  names.arg = course_grade_labels,
  las = 2,          # draw labels perpendicular to the axis
  cex.names = 0.6,  # shrink labels so all bars keep theirs
  main = "Expected Probability by Course and Previous Qualification Grade",
  ylab = "Expected Probability of unemployment rate"
)
par(old_par)

# Bar chart of the expected probability for every row of grouped_data_3.
# Each row is one (Target, Previous qualification grade range) pair, so the
# bar label combines both keys; labelling by Target alone repeats each Target
# name on several bars and makes them indistinguishable.
target_grade_labels <- paste(
  grouped_data_3$Target,
  grouped_data_3$`Previous qualification (grade)`,
  sep = " / "
)

# Widen the bottom margin so the rotated labels fit, and restore the previous
# graphics settings afterwards.
old_par <- par(mar = c(10, 4, 4, 2) + 0.1)
barplot(
  grouped_data_3$expected_prob_GDP,
  names.arg = target_grade_labels,
  las = 2,          # draw labels perpendicular to the axis
  cex.names = 0.7,  # shrink labels so all bars keep theirs
  main = "Expected Probability by Target and Previous Qualification Grade",
  ylab = "Expected Probability of GDP"
)
par(old_par)
