# Load the data
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(skimr)
# Read the local, semicolon-delimited copy of the dataset (described in the
# original notes as coming from the UCI Machine Learning Repository) and show
# its column names and types.
data_path <- "C:/Users/kondo/Desktop/OneDrive/Desktop/INTRO to Statistics and R/Data Set and work/data.csv"
data <- read_delim(
  data_path,
  delim = ";",
  show_col_types = FALSE
)
glimpse(data)
## Rows: 4,424
## Columns: 37
## $ `Marital status` <dbl> 1, 1, 1, 1, 2, 2, 1, …
## $ `Application mode` <dbl> 17, 15, 1, 17, 39, 39…
## $ `Application order` <dbl> 5, 1, 5, 2, 1, 1, 1, …
## $ Course <dbl> 171, 9254, 9070, 9773…
## $ `Daytime/evening attendance\t` <dbl> 1, 1, 1, 1, 0, 0, 1, …
## $ `Previous qualification` <dbl> 1, 1, 1, 1, 1, 19, 1,…
## $ `Previous qualification (grade)` <dbl> 122.0, 160.0, 122.0, …
## $ Nacionality <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ `Mother's qualification` <dbl> 19, 1, 37, 38, 37, 37…
## $ `Father's qualification` <dbl> 12, 3, 37, 37, 38, 37…
## $ `Mother's occupation` <dbl> 5, 3, 9, 5, 9, 9, 7, …
## $ `Father's occupation` <dbl> 9, 3, 9, 3, 9, 7, 10,…
## $ `Admission grade` <dbl> 127.3, 142.5, 124.8, …
## $ Displaced <dbl> 1, 1, 1, 1, 0, 0, 1, …
## $ `Educational special needs` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ Debtor <dbl> 0, 0, 0, 0, 0, 1, 0, …
## $ `Tuition fees up to date` <dbl> 1, 0, 0, 1, 1, 1, 1, …
## $ Gender <dbl> 1, 1, 1, 0, 0, 1, 0, …
## $ `Scholarship holder` <dbl> 0, 0, 0, 0, 0, 0, 1, …
## $ `Age at enrollment` <dbl> 20, 19, 19, 20, 45, 5…
## $ International <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 1st sem (credited)` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 1st sem (enrolled)` <dbl> 0, 6, 6, 6, 6, 5, 7, …
## $ `Curricular units 1st sem (evaluations)` <dbl> 0, 6, 0, 8, 9, 10, 9,…
## $ `Curricular units 1st sem (approved)` <dbl> 0, 6, 0, 6, 5, 5, 7, …
## $ `Curricular units 1st sem (grade)` <dbl> 0.00000, 14.00000, 0.…
## $ `Curricular units 1st sem (without evaluations)` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 2nd sem (credited)` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 2nd sem (enrolled)` <dbl> 0, 6, 6, 6, 6, 5, 8, …
## $ `Curricular units 2nd sem (evaluations)` <dbl> 0, 6, 0, 10, 6, 17, 8…
## $ `Curricular units 2nd sem (approved)` <dbl> 0, 6, 0, 5, 6, 5, 8, …
## $ `Curricular units 2nd sem (grade)` <dbl> 0.00000, 13.66667, 0.…
## $ `Curricular units 2nd sem (without evaluations)` <dbl> 0, 0, 0, 0, 0, 5, 0, …
## $ `Unemployment rate` <dbl> 10.8, 13.9, 10.8, 9.4…
## $ `Inflation rate` <dbl> 1.4, -0.3, 1.4, -0.8,…
## $ GDP <dbl> 1.74, 0.79, 1.74, -3.…
## $ Target <chr> "Dropout", "Graduate"…
# Explore the data: preview the first rows of the tibble
head(data)
## # A tibble: 6 × 37
## `Marital status` `Application mode` `Application order` Course
## <dbl> <dbl> <dbl> <dbl>
## 1 1 17 5 171
## 2 1 15 1 9254
## 3 1 1 5 9070
## 4 1 17 2 9773
## 5 2 39 1 8014
## 6 2 39 1 9991
## # ℹ 33 more variables: `Daytime/evening attendance\t` <dbl>,
## # `Previous qualification` <dbl>, `Previous qualification (grade)` <dbl>,
## # Nacionality <dbl>, `Mother's qualification` <dbl>,
## # `Father's qualification` <dbl>, `Mother's occupation` <dbl>,
## # `Father's occupation` <dbl>, `Admission grade` <dbl>, Displaced <dbl>,
## # `Educational special needs` <dbl>, Debtor <dbl>,
## # `Tuition fees up to date` <dbl>, Gender <dbl>, …
# Preview the last rows of the tibble
tail(data)
## # A tibble: 6 × 37
## `Marital status` `Application mode` `Application order` Course
## <dbl> <dbl> <dbl> <dbl>
## 1 1 44 1 9070
## 2 1 1 6 9773
## 3 1 1 2 9773
## 4 1 1 1 9500
## 5 1 1 1 9147
## 6 1 10 1 9773
## # ℹ 33 more variables: `Daytime/evening attendance\t` <dbl>,
## # `Previous qualification` <dbl>, `Previous qualification (grade)` <dbl>,
## # Nacionality <dbl>, `Mother's qualification` <dbl>,
## # `Father's qualification` <dbl>, `Mother's occupation` <dbl>,
## # `Father's occupation` <dbl>, `Admission grade` <dbl>, Displaced <dbl>,
## # `Educational special needs` <dbl>, Debtor <dbl>,
## # `Tuition fees up to date` <dbl>, Gender <dbl>, …
# Per-column summary from skimr: missing counts, completeness, quantiles and
# inline histograms for every variable
skim(data)
| Name | data |
| Number of rows | 4424 |
| Number of columns | 37 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 36 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Target | 0 | 1 | 7 | 8 | 0 | 3 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Marital status | 0 | 1 | 1.18 | 0.61 | 1.00 | 1.00 | 1.00 | 1.00 | 6.00 | ▇▁▁▁▁ |
| Application mode | 0 | 1 | 18.67 | 17.48 | 1.00 | 1.00 | 17.00 | 39.00 | 57.00 | ▇▅▁▆▁ |
| Application order | 0 | 1 | 1.73 | 1.31 | 0.00 | 1.00 | 1.00 | 2.00 | 9.00 | ▇▂▁▁▁ |
| Course | 0 | 1 | 8856.64 | 2063.57 | 33.00 | 9085.00 | 9238.00 | 9556.00 | 9991.00 | ▁▁▁▁▇ |
| Daytime/evening attendance | 0 | 1 | 0.89 | 0.31 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▁▁▇ |
| Previous qualification | 0 | 1 | 4.58 | 10.22 | 1.00 | 1.00 | 1.00 | 1.00 | 43.00 | ▇▁▁▁▁ |
| Previous qualification (grade) | 0 | 1 | 132.61 | 13.19 | 95.00 | 125.00 | 133.10 | 140.00 | 190.00 | ▁▇▇▁▁ |
| Nacionality | 0 | 1 | 1.87 | 6.91 | 1.00 | 1.00 | 1.00 | 1.00 | 109.00 | ▇▁▁▁▁ |
| Mother’s qualification | 0 | 1 | 19.56 | 15.60 | 1.00 | 2.00 | 19.00 | 37.00 | 44.00 | ▇▁▅▁▇ |
| Father’s qualification | 0 | 1 | 22.28 | 15.34 | 1.00 | 3.00 | 19.00 | 37.00 | 44.00 | ▆▁▅▁▇ |
| Mother’s occupation | 0 | 1 | 10.96 | 26.42 | 0.00 | 4.00 | 5.00 | 9.00 | 194.00 | ▇▁▁▁▁ |
| Father’s occupation | 0 | 1 | 11.03 | 25.26 | 0.00 | 4.00 | 7.00 | 9.00 | 195.00 | ▇▁▁▁▁ |
| Admission grade | 0 | 1 | 126.98 | 14.48 | 95.00 | 117.90 | 126.10 | 134.80 | 190.00 | ▂▇▃▁▁ |
| Displaced | 0 | 1 | 0.55 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
| Educational special needs | 0 | 1 | 0.01 | 0.11 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| Debtor | 0 | 1 | 0.11 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| Tuition fees up to date | 0 | 1 | 0.88 | 0.32 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▁▁▇ |
| Gender | 0 | 1 | 0.35 | 0.48 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
| Scholarship holder | 0 | 1 | 0.25 | 0.43 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Age at enrollment | 0 | 1 | 23.27 | 7.59 | 17.00 | 19.00 | 20.00 | 25.00 | 70.00 | ▇▁▁▁▁ |
| International | 0 | 1 | 0.02 | 0.16 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| Curricular units 1st sem (credited) | 0 | 1 | 0.71 | 2.36 | 0.00 | 0.00 | 0.00 | 0.00 | 20.00 | ▇▁▁▁▁ |
| Curricular units 1st sem (enrolled) | 0 | 1 | 6.27 | 2.48 | 0.00 | 5.00 | 6.00 | 7.00 | 26.00 | ▃▇▁▁▁ |
| Curricular units 1st sem (evaluations) | 0 | 1 | 8.30 | 4.18 | 0.00 | 6.00 | 8.00 | 10.00 | 45.00 | ▇▃▁▁▁ |
| Curricular units 1st sem (approved) | 0 | 1 | 4.71 | 3.09 | 0.00 | 3.00 | 5.00 | 6.00 | 26.00 | ▇▆▁▁▁ |
| Curricular units 1st sem (grade) | 0 | 1 | 10.64 | 4.84 | 0.00 | 11.00 | 12.29 | 13.40 | 18.88 | ▂▁▂▇▁ |
| Curricular units 1st sem (without evaluations) | 0 | 1 | 0.14 | 0.69 | 0.00 | 0.00 | 0.00 | 0.00 | 12.00 | ▇▁▁▁▁ |
| Curricular units 2nd sem (credited) | 0 | 1 | 0.54 | 1.92 | 0.00 | 0.00 | 0.00 | 0.00 | 19.00 | ▇▁▁▁▁ |
| Curricular units 2nd sem (enrolled) | 0 | 1 | 6.23 | 2.20 | 0.00 | 5.00 | 6.00 | 7.00 | 23.00 | ▁▇▁▁▁ |
| Curricular units 2nd sem (evaluations) | 0 | 1 | 8.06 | 3.95 | 0.00 | 6.00 | 8.00 | 10.00 | 33.00 | ▃▇▁▁▁ |
| Curricular units 2nd sem (approved) | 0 | 1 | 4.44 | 3.01 | 0.00 | 2.00 | 5.00 | 6.00 | 20.00 | ▆▇▁▁▁ |
| Curricular units 2nd sem (grade) | 0 | 1 | 10.23 | 5.21 | 0.00 | 10.75 | 12.20 | 13.33 | 18.57 | ▂▁▁▇▁ |
| Curricular units 2nd sem (without evaluations) | 0 | 1 | 0.15 | 0.75 | 0.00 | 0.00 | 0.00 | 0.00 | 12.00 | ▇▁▁▁▁ |
| Unemployment rate | 0 | 1 | 11.57 | 2.66 | 7.60 | 9.40 | 11.10 | 13.90 | 16.20 | ▆▆▇▂▅ |
| Inflation rate | 0 | 1 | 1.23 | 1.38 | -0.80 | 0.30 | 1.40 | 2.60 | 3.70 | ▆▇▆▆▃ |
| GDP | 0 | 1 | 0.00 | 2.27 | -4.06 | -1.70 | 0.32 | 1.79 | 3.51 | ▆▂▆▇▅ |
# Frequency of Target variable: number of students in each outcome class
table(data$Target)
##
## Dropout Enrolled Graduate
## 1421 794 2209
# Bar plot of the Target class counts
target_counts <- table(data$Target)
barplot(target_counts, main = "Distribution of Target Variable")
# Frequency of each marital-status code
table(data[["Marital status"]])
##
## 1 2 3 4 5 6
## 3919 379 4 91 25 6
# Frequency of each Gender code (the column is coded 0/1)
table(data$Gender)
##
## 0 1
## 2868 1556
# Distribution of admission grades
hist(data$`Admission grade`)
# Distribution of age at enrollment
hist(data$`Age at enrollment`)
# Admission grade split by marital-status code (one box per code)
boxplot(`Admission grade` ~ `Marital status`, data = data, col = "lightblue", main = "Admission Grade by Marital Status")
# Violin plot of age at enrollment for each Target class, with the individual
# observations overlaid as horizontally jittered, semi-transparent points.
age_by_target_plot <- ggplot(
  data,
  aes(x = Target, y = `Age at enrollment`, fill = Target)
) +
  geom_violin(scale = "width", trim = FALSE) +
  geom_jitter(alpha = 0.5, position = position_jitter(width = 0.2)) +
  ggtitle("Distribution of Age at Enrollment by Target") +
  xlab("Target") +
  ylab("Age at Enrollment") +
  theme_minimal()
print(age_by_target_plot)
# Side-by-side bar chart of Target counts within each marital-status code.
# The numeric codes 1-6 are shown with descriptive axis labels.
marital_labels <- c(
  "Single", "Married", "Widower", "Divorced", "Facto Union",
  "Legally Separated"
)
ggplot(data, aes(x = factor(`Marital status`), fill = Target)) +
  geom_bar(position = "dodge") +
  scale_x_discrete(labels = marital_labels) +
  ggtitle("Distribution of Marital Status by Target") +
  xlab("Marital Status") +
  ylab("Frequency") +
  theme_minimal()
# One-way ANOVA: does the mean first-semester grade differ across Target groups?
# Build the two-column analysis table, renaming the grade column to `Grade`
# as part of the selection.
data_anova <- data %>%
  select(Target, Grade = `Curricular units 1st sem (grade)`)
# Fit the model and print the ANOVA table
anova_result <- aov(Grade ~ Target, data = data_anova)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Target 2 25321 12661 713.5 <2e-16 ***
## Residuals 4421 78447 18
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Draw the diagnostic plots for the fitted ANOVA model
plot(anova_result)
# Side-by-side bar chart of Target counts for each Gender code (0/1).
# Target has three classes (Dropout, Enrolled, Graduate), so the manual fill
# scale must name a colour for all three; previously "Graduate" had no colour
# assigned. The title is also updated because the chart shows every outcome,
# not only dropouts.
ggplot(data, aes(x = factor(Gender), fill = Target)) +
  geom_bar(position = "dodge", color = "black") +
  labs(
    title = "Distribution of Target by Gender",
    x = "Gender",
    y = "Count"
  ) +
  scale_fill_manual(
    values = c("Dropout" = "red", "Enrolled" = "blue", "Graduate" = "green")
  ) +
  theme_minimal()
# Tukey HSD post-hoc test on the fitted ANOVA: pairwise comparisons between
# the Target groups.
tukey_result <- TukeyHSD(anova_result)
# Print the object itself. summary() on this object only reports the list's
# length/class/mode, not the pairwise differences, intervals and adjusted
# p-values.
print(tukey_result)
## Length Class Mode
## Target 12 -none- numeric
# Marital status by Target, drawn again here (same chart as earlier in the
# report): dodged bars of Target counts per marital-status code.
status_axis_labels <- c(
  "Single", "Married", "Widower", "Divorced", "Facto Union",
  "Legally Separated"
)
marital_target_plot <- ggplot(
  data,
  aes(x = factor(`Marital status`), fill = Target)
) +
  geom_bar(position = "dodge") +
  scale_x_discrete(labels = status_axis_labels) +
  ggtitle("Distribution of Marital Status by Target") +
  xlab("Marital Status") +
  ylab("Frequency") +
  theme_minimal()
print(marital_target_plot)
##Null Hypothesis 1 (H0): The average age at enrollment for students who dropped out (Target: “Dropout”) is the same as the average age at enrollment for students who graduated (Target: “Graduate”).
##Null Hypothesis 2 (H0): There is no significant difference in the admission grade between students who have educational special needs (Educational special needs: 1) and those who don’t have educational special needs (Educational special needs: 0).
##Hypothesis Test 1:
Alpha Level (α): 0.05 (a 5% probability of a Type I error). Power Level: 0.80. Minimum Effect Size (Cohen’s d): 0.50 (medium effect size).
We will perform a two-sample t-test to compare the means of age at enrollment for students who dropped out and those who graduated. Then, we will visualize the results.
# Hypothesis Test 1: Age at Enrollment vs. Target (Dropout vs. Graduate)
# Two-sample t-test of H0: mean age at enrollment is the same for students
# who dropped out and students who graduated.

# Keep only the "Dropout" and "Graduate" rows; "Enrolled" students are not
# part of this comparison.
subset_df <- data[data$Target %in% c("Dropout", "Graduate"), ]
alpha <- 0.05
# Design targets stated for the test; they are recorded here but not used in
# the computation below.
power <- 0.80
effect_size <- 0.50

# Take both the ages and the group mask from subset_df so they line up row by
# row. Indexing the full `data` column with a mask built from the shorter
# subset_df selects the wrong rows.
# NOTE(review): any result recorded before this fix came from misaligned
# groups; re-run to regenerate the conclusion.
dropout_age <- subset_df$`Age at enrollment`[subset_df$Target == "Dropout"]
graduate_age <- subset_df$`Age at enrollment`[subset_df$Target == "Graduate"]
result_t_test <- t.test(dropout_age, graduate_age)

# Visualization: plot the same two groups that were tested, so the two colours
# and the legend match the boxes drawn.
boxplot(`Age at enrollment` ~ Target,
  data = subset_df,
  main = "Age at Enrollment by Target",
  xlab = "Target", ylab = "Age at Enrollment", col = c("blue", "green")
)
legend("topright",
  legend = c("Dropout", "Graduate"),
  fill = c("blue", "green")
)

# Interpretation of Results: compare the p-value with the significance level
if (result_t_test$p.value < alpha) {
  cat("H0 is rejected. There is a significant difference in age at enrollment between Dropout and Graduate students.")
} else {
  cat("H0 is not rejected. There is no significant difference in age at enrollment between Dropout and Graduate students.")
}
## H0 is not rejected. There is no significant difference in age at enrollment between Dropout and Graduate students.
##Hypothesis Test 2:
Alpha Level (α): 0.05. Power Level: 0.80. Minimum Effect Size (Cohen’s d): 0.50 (medium effect size). We will perform a two-sample t-test to compare the mean admission grade of students with and without educational special needs.
# Hypothesis Test 2: Admission Grade vs. Educational Special Needs (1 vs. 0)
# Two-sample t-test of H0: mean admission grade is the same for students with
# and without educational special needs.
alpha <- 0.05
power <- 0.80
effect_size <- 0.50
result_t_test <- t.test(data$`Admission grade` ~ data$`Educational special needs`)

# Visualization: one box per special-needs code, with a matching legend
group_colours <- c("blue", "green")
boxplot(
  `Admission grade` ~ `Educational special needs`,
  data = data,
  col = group_colours,
  main = "Admission Grade by Educational Special Needs",
  xlab = "Educational Special Needs",
  ylab = "Admission Grade"
)
legend(
  "topright",
  fill = group_colours,
  legend = c("No Educational Special Needs", "With Educational Special Needs")
)

# Interpretation of Results: choose the message from the p-value, then print it
conclusion <- if (result_t_test$p.value < alpha) {
  "H0 is rejected. There is a significant difference in admission grade between students with and without educational special needs."
} else {
  "H0 is not rejected. There is no significant difference in admission grade between students with and without educational special needs."
}
cat(conclusion)
## H0 is not rejected. There is no significant difference in admission grade between students with and without educational special needs.
##Hypothesis Test 1: Age at Enrollment vs. Target (Dropout vs. Graduate) Result: H0 is not rejected. Interpretation: There is no significant difference in age at enrollment between students who dropped out and those who graduated.
##Hypothesis Test 2: Admission Grade vs. Educational Special Needs (1 vs. 0) Result: H0 is not rejected. Interpretation: There is no significant difference in admission grade between students with educational special needs and those without. These results suggest that, based on the data and the chosen significance level, there is insufficient evidence to conclude that there are significant differences in age at enrollment or admission grade among the specified groups.
##Further Considerations:
While these specific hypotheses did not yield significant results, it’s important to note that the absence of evidence of a significant difference does not necessarily mean that there are no practical or real-world differences. Further research and analyses may be needed to explore other factors that could influence these variables and to understand their implications better.
Additionally, it might be worth considering whether the sample size or the choice of statistical test had an impact on the results. Further investigations or different analytical approaches could provide deeper insights into these variables’ relationships and significance.