library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
For this week’s data dive, I would be using the dataset_1 which is a reformed version my my original data set and is same as what I used in last two data dives.
dataset <-read_delim("C:/Users/MSKR/MASTER'S_ADS/STATISTICS_SEM1/DATA_SET_1.csv", delim = ",")
## Rows: 4424 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Target
## dbl (36): Marital status, Application mode, Application order, Course, Dayti...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dataset_1<-mutate(dataset, marital_status = ifelse(dataset$`Marital status` == 1, "single",
ifelse(`Marital status` == 2, "married",
ifelse(`Marital status` == 3, "widower",
ifelse(`Marital status` == 4, "divorced",
ifelse(`Marital status` == 5, "facto union",
ifelse(`Marital status` == 6, "legally seperated", "no")))))))
dataset_1<-mutate(dataset_1,gender=ifelse(Gender==1,"male","female"))
summary(dataset_1)
## Marital status Application mode Application order Course
## Min. :1.000 Min. : 1.00 Min. :0.000 Min. : 33
## 1st Qu.:1.000 1st Qu.: 1.00 1st Qu.:1.000 1st Qu.:9085
## Median :1.000 Median :17.00 Median :1.000 Median :9238
## Mean :1.179 Mean :18.67 Mean :1.728 Mean :8857
## 3rd Qu.:1.000 3rd Qu.:39.00 3rd Qu.:2.000 3rd Qu.:9556
## Max. :6.000 Max. :57.00 Max. :9.000 Max. :9991
## Daytime/evening attendance\t Previous qualification
## Min. :0.0000 Min. : 1.000
## 1st Qu.:1.0000 1st Qu.: 1.000
## Median :1.0000 Median : 1.000
## Mean :0.8908 Mean : 4.578
## 3rd Qu.:1.0000 3rd Qu.: 1.000
## Max. :1.0000 Max. :43.000
## Previous qualification (grade) Nacionality Mother's qualification
## Min. : 95.0 Min. : 1.000 Min. : 1.00
## 1st Qu.:125.0 1st Qu.: 1.000 1st Qu.: 2.00
## Median :133.1 Median : 1.000 Median :19.00
## Mean :132.6 Mean : 1.873 Mean :19.56
## 3rd Qu.:140.0 3rd Qu.: 1.000 3rd Qu.:37.00
## Max. :190.0 Max. :109.000 Max. :44.00
## Father's qualification Mother's occupation Father's occupation Admission grade
## Min. : 1.00 Min. : 0.00 Min. : 0.00 Min. : 95.0
## 1st Qu.: 3.00 1st Qu.: 4.00 1st Qu.: 4.00 1st Qu.:117.9
## Median :19.00 Median : 5.00 Median : 7.00 Median :126.1
## Mean :22.28 Mean : 10.96 Mean : 11.03 Mean :127.0
## 3rd Qu.:37.00 3rd Qu.: 9.00 3rd Qu.: 9.00 3rd Qu.:134.8
## Max. :44.00 Max. :194.00 Max. :195.00 Max. :190.0
## Displaced Educational special needs Debtor
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :1.0000 Median :0.00000 Median :0.0000
## Mean :0.5484 Mean :0.01153 Mean :0.1137
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
## Tuition fees up to date Gender Scholarship holder Age at enrollment
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :17.00
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:19.00
## Median :1.0000 Median :0.0000 Median :0.0000 Median :20.00
## Mean :0.8807 Mean :0.3517 Mean :0.2484 Mean :23.27
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:25.00
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :70.00
## International Curricular units 1st sem (credited)
## Min. :0.00000 Min. : 0.00
## 1st Qu.:0.00000 1st Qu.: 0.00
## Median :0.00000 Median : 0.00
## Mean :0.02486 Mean : 0.71
## 3rd Qu.:0.00000 3rd Qu.: 0.00
## Max. :1.00000 Max. :20.00
## Curricular units 1st sem (enrolled) Curricular units 1st sem (evaluations)
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 5.000 1st Qu.: 6.000
## Median : 6.000 Median : 8.000
## Mean : 6.271 Mean : 8.299
## 3rd Qu.: 7.000 3rd Qu.:10.000
## Max. :26.000 Max. :45.000
## Curricular units 1st sem (approved) Curricular units 1st sem (grade)
## Min. : 0.000 Min. : 0.00
## 1st Qu.: 3.000 1st Qu.:11.00
## Median : 5.000 Median :12.29
## Mean : 4.707 Mean :10.64
## 3rd Qu.: 6.000 3rd Qu.:13.40
## Max. :26.000 Max. :18.88
## Curricular units 1st sem (without evaluations)
## Min. : 0.0000
## 1st Qu.: 0.0000
## Median : 0.0000
## Mean : 0.1377
## 3rd Qu.: 0.0000
## Max. :12.0000
## Curricular units 2nd sem (credited) Curricular units 2nd sem (enrolled)
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 5.000
## Median : 0.0000 Median : 6.000
## Mean : 0.5418 Mean : 6.232
## 3rd Qu.: 0.0000 3rd Qu.: 7.000
## Max. :19.0000 Max. :23.000
## Curricular units 2nd sem (evaluations) Curricular units 2nd sem (approved)
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 6.000 1st Qu.: 2.000
## Median : 8.000 Median : 5.000
## Mean : 8.063 Mean : 4.436
## 3rd Qu.:10.000 3rd Qu.: 6.000
## Max. :33.000 Max. :20.000
## Curricular units 2nd sem (grade)
## Min. : 0.00
## 1st Qu.:10.75
## Median :12.20
## Mean :10.23
## 3rd Qu.:13.33
## Max. :18.57
## Curricular units 2nd sem (without evaluations) Unemployment rate
## Min. : 0.0000 Min. : 7.60
## 1st Qu.: 0.0000 1st Qu.: 9.40
## Median : 0.0000 Median :11.10
## Mean : 0.1503 Mean :11.57
## 3rd Qu.: 0.0000 3rd Qu.:13.90
## Max. :12.0000 Max. :16.20
## Inflation rate GDP Target marital_status
## Min. :-0.800 Min. :-4.060000 Length:4424 Length:4424
## 1st Qu.: 0.300 1st Qu.:-1.700000 Class :character Class :character
## Median : 1.400 Median : 0.320000 Mode :character Mode :character
## Mean : 1.228 Mean : 0.001969
## 3rd Qu.: 2.600 3rd Qu.: 1.790000
## Max. : 3.700 Max. : 3.510000
## gender
## Length:4424
## Class :character
## Mode :character
##
##
##
Let’s create 5 subsets from the total population of data from dataset_1.
sample_size<-nrow(dataset_1)%/%2
df1<-sample_n(dataset_1,sample_size,replace=TRUE)
df1
## # A tibble: 2,212 × 39
## `Marital status` `Application mode` `Application order` Course
## <dbl> <dbl> <dbl> <dbl>
## 1 1 1 1 9238
## 2 1 17 1 9670
## 3 1 1 4 9773
## 4 1 1 2 9147
## 5 1 1 3 9238
## 6 1 43 1 9991
## 7 2 39 1 9556
## 8 1 17 2 9119
## 9 1 1 4 9500
## 10 1 1 1 9085
## # ℹ 2,202 more rows
## # ℹ 35 more variables: `Daytime/evening attendance\t` <dbl>,
## # `Previous qualification` <dbl>, `Previous qualification (grade)` <dbl>,
## # Nacionality <dbl>, `Mother's qualification` <dbl>,
## # `Father's qualification` <dbl>, `Mother's occupation` <dbl>,
## # `Father's occupation` <dbl>, `Admission grade` <dbl>, Displaced <dbl>,
## # `Educational special needs` <dbl>, Debtor <dbl>, …
df2<-sample_n(dataset_1,sample_size,replace=TRUE)
df2
## # A tibble: 2,212 × 39
## `Marital status` `Application mode` `Application order` Course
## <dbl> <dbl> <dbl> <dbl>
## 1 1 1 1 9500
## 2 1 44 1 9238
## 3 1 39 1 9556
## 4 1 1 1 171
## 5 1 1 1 171
## 6 1 1 1 9147
## 7 1 1 5 9500
## 8 1 17 2 9556
## 9 1 17 2 8014
## 10 1 39 1 9119
## # ℹ 2,202 more rows
## # ℹ 35 more variables: `Daytime/evening attendance\t` <dbl>,
## # `Previous qualification` <dbl>, `Previous qualification (grade)` <dbl>,
## # Nacionality <dbl>, `Mother's qualification` <dbl>,
## # `Father's qualification` <dbl>, `Mother's occupation` <dbl>,
## # `Father's occupation` <dbl>, `Admission grade` <dbl>, Displaced <dbl>,
## # `Educational special needs` <dbl>, Debtor <dbl>, …
df3<-sample_n(dataset_1,sample_size,replace=TRUE)
df3
## # A tibble: 2,212 × 39
## `Marital status` `Application mode` `Application order` Course
## <dbl> <dbl> <dbl> <dbl>
## 1 2 39 1 8014
## 2 1 39 1 9991
## 3 1 18 1 9500
## 4 1 1 5 9500
## 5 1 17 2 9500
## 6 1 18 1 9500
## 7 1 39 1 9130
## 8 1 17 3 9147
## 9 1 1 1 9238
## 10 1 44 1 9003
## # ℹ 2,202 more rows
## # ℹ 35 more variables: `Daytime/evening attendance\t` <dbl>,
## # `Previous qualification` <dbl>, `Previous qualification (grade)` <dbl>,
## # Nacionality <dbl>, `Mother's qualification` <dbl>,
## # `Father's qualification` <dbl>, `Mother's occupation` <dbl>,
## # `Father's occupation` <dbl>, `Admission grade` <dbl>, Displaced <dbl>,
## # `Educational special needs` <dbl>, Debtor <dbl>, …
df4<-sample_n(dataset_1,sample_size,replace=TRUE)
df4
## # A tibble: 2,212 × 39
## `Marital status` `Application mode` `Application order` Course
## <dbl> <dbl> <dbl> <dbl>
## 1 1 1 3 9773
## 2 1 1 1 9500
## 3 1 43 1 9991
## 4 1 44 1 9003
## 5 1 39 1 9670
## 6 1 1 1 9500
## 7 1 7 1 9500
## 8 1 43 1 9500
## 9 1 1 5 9070
## 10 1 39 1 9500
## # ℹ 2,202 more rows
## # ℹ 35 more variables: `Daytime/evening attendance\t` <dbl>,
## # `Previous qualification` <dbl>, `Previous qualification (grade)` <dbl>,
## # Nacionality <dbl>, `Mother's qualification` <dbl>,
## # `Father's qualification` <dbl>, `Mother's occupation` <dbl>,
## # `Father's occupation` <dbl>, `Admission grade` <dbl>, Displaced <dbl>,
## # `Educational special needs` <dbl>, Debtor <dbl>, …
df5<-sample_n(dataset_1,sample_size,replace=TRUE)
df5
## # A tibble: 2,212 × 39
## `Marital status` `Application mode` `Application order` Course
## <dbl> <dbl> <dbl> <dbl>
## 1 1 39 1 9238
## 2 1 1 2 9773
## 3 1 43 1 9238
## 4 1 1 1 9500
## 5 1 39 1 9003
## 6 1 17 4 9500
## 7 1 17 4 9147
## 8 1 44 1 171
## 9 1 17 1 9254
## 10 1 1 1 9254
## # ℹ 2,202 more rows
## # ℹ 35 more variables: `Daytime/evening attendance\t` <dbl>,
## # `Previous qualification` <dbl>, `Previous qualification (grade)` <dbl>,
## # Nacionality <dbl>, `Mother's qualification` <dbl>,
## # `Father's qualification` <dbl>, `Mother's occupation` <dbl>,
## # `Father's occupation` <dbl>, `Admission grade` <dbl>, Displaced <dbl>,
## # `Educational special needs` <dbl>, Debtor <dbl>, …
All five sub samples have equal number of records, i.e.,2212 observations with 40 variables in each respectively.
We can analyse how each subset is divided randomly and how the average values of two metrics are affected by sampling.
Below is for the original data set:
dataset_1|>
group_by(gender)|>
summarise(count=n(),
mean_1stsemGrade=mean(`Curricular units 1st sem (grade)`),
mean_AdmGrade=mean(`Admission grade`)
)
## # A tibble: 2 × 4
## gender count mean_1stsemGrade mean_AdmGrade
## <chr> <int> <dbl> <dbl>
## 1 female 2868 11.3 127.
## 2 male 1556 9.40 127.
In the original data set, we have approximately 2:1 Female to Male ratio and Average Admission grade is higher for Male than Female by 1%.
Now, finding out how Male and Female population is divided into each sample and look into their average values of Grades obtained in 1st Semester and Admission grades into the programs.
df1|>
group_by(gender)|>
summarise(
count = n(),
mean_1stsemGrade = mean(`Curricular units 1st sem (grade)`,),
mean_AdmGrade = mean(`Admission grade`)
)
## # A tibble: 2 × 4
## gender count mean_1stsemGrade mean_AdmGrade
## <chr> <int> <dbl> <dbl>
## 1 female 1428 11.2 128.
## 2 male 784 9.53 128.
df2|>
group_by(gender)|>
summarise(
count = n(),
mean_1stsemGrade = mean(`Curricular units 1st sem (grade)`,),
mean_AdmGrade = mean(`Admission grade`)
)
## # A tibble: 2 × 4
## gender count mean_1stsemGrade mean_AdmGrade
## <chr> <int> <dbl> <dbl>
## 1 female 1443 11.2 127.
## 2 male 769 9.55 128.
df3|>
group_by(gender)|>
summarise(
count = n(),
mean_1stsemGrade = mean(`Curricular units 1st sem (grade)`,),
mean_AdmGrade = mean(`Admission grade`)
)
## # A tibble: 2 × 4
## gender count mean_1stsemGrade mean_AdmGrade
## <chr> <int> <dbl> <dbl>
## 1 female 1404 11.3 127.
## 2 male 808 9.63 128.
df4|>
group_by(gender)|>
summarise(
count = n(),
mean_1stsemGrade = mean(`Curricular units 1st sem (grade)`,),
mean_AdmGrade = mean(`Admission grade`)
)
## # A tibble: 2 × 4
## gender count mean_1stsemGrade mean_AdmGrade
## <chr> <int> <dbl> <dbl>
## 1 female 1425 11.4 128.
## 2 male 787 10.0 128.
df5|>
group_by(gender)|>
summarise(
count = n(),
mean_1stsemGrade = mean(`Curricular units 1st sem (grade)`),
mean_AdmGrade = mean(`Admission grade`)
)
## # A tibble: 2 × 4
## gender count mean_1stsemGrade mean_AdmGrade
## <chr> <int> <dbl> <dbl>
## 1 female 1461 11.4 127.
## 2 male 751 9.55 128.
The same analysis on the whole data population looks like:
dataset_1|>
group_by(gender)|>
summarise(
count = n(),
mean_1stsemGrade = mean(`Curricular units 1st sem (grade)`),
mean_AdmGrade = mean(`Admission grade`)
)
## # A tibble: 2 × 4
## gender count mean_1stsemGrade mean_AdmGrade
## <chr> <int> <dbl> <dbl>
## 1 female 2868 11.3 127.
## 2 male 1556 9.40 127.
In each of the samples, the Female to Male ratio is nearly 2:1 which is same as the original data set, so we are not missing on the volume ratio in the samples.
As we analysed in previous data dives, we do not have any Null values in any of the variables in our data set and all the categorical variables are properly defined with integer values.
Firstly, the average of 1st semester grades in both Female and Male in all the samples are very close or almost similar to that of the original data. so the distribution of the average is constant throughout the samples.
Secondly, the average values of the Admission grades also resembles high similarities in the samples with the original values.
To comment on the existence of any anomaly in our samples, it would be on the nationality or the marital status which would affect the determination of Target as one sample might have heavily weighted on one kind of the above combination and doesn’t guarantee a equally diversified spread of categories. But seeing the calculated metrics, we can ignore this case as the averages are consistent among all the samples.
we can conclude that, random sampling our data set with half the number of records have a consistent behavior in terms of approximations and expected values.
This strongly gives us confidence that using analytic metrics on a random sample of the population, we can draw insights which will equally apply to the entire population of the data set.
num_simulations <- 1
simulated_means <- numeric(num_simulations)
simulated_means[1] <- mean(df1$`Curricular units 1st sem (grade)`, na.rm = TRUE)
simulated_means[2] <- mean(df2$`Curricular units 1st sem (grade)`, na.rm = TRUE)
simulated_means[3] <- mean(df3$`Curricular units 1st sem (grade)`, na.rm = TRUE)
simulated_means[4] <- mean(df4$`Curricular units 1st sem (grade)`, na.rm = TRUE)
simulated_means[5] <- mean(df5$`Curricular units 1st sem (grade)`, na.rm = TRUE)
# Visualizing the distribution of simulated means
ggplot(data.frame(simulated_means), aes(x = simulated_means)) +
geom_histogram(bins = 30, fill = "blue", alpha = 0.8) +
labs(title = "Distribution of Simulated Means", x = "Mean Score", y = "Frequency")
simulated_means
## [1] 10.63321 10.62437 10.67854 10.92841 10.74548
sd(simulated_means)
## [1] 0.1249628
mean(simulated_means)
## [1] 10.722