Description

Objective

Introduction

Fields used:

Hypotheses (chi-square goodness-of-fit test):

Packages and data load

# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2) 
# Folder that holds the data file. The path is built explicitly instead of
# calling setwd(), so the script does not change the session's working
# directory as a side effect.
data_dir <- path.expand("~/Downloads/25_Semesters/Fall/DATA101")

# Load the dataset
MC_Data <- read_csv(
  file.path(data_dir, "Montgomery_College_Enrollment_Data.csv")
)
## Rows: 25320 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): Student Type, Student Status, Gender, Ethnicity, Race, Attending G...
## dbl  (2): Fall Term, ZIP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Analyse a separate binding, `df`, rather than `MC_Data` itself
df <- MC_Data

# Print the structure of the dataset (column names, types, leading values)
df |> str()
## spc_tbl_ [25,320 × 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Fall Term               : num [1:25320] 2015 2015 2015 2015 2015 ...
##  $ Student Type            : chr [1:25320] "Continuing" "Continuing" "Continuing" "New" ...
##  $ Student Status          : chr [1:25320] "Full-Time" "Part-Time" "Part-Time" "Full-Time" ...
##  $ Gender                  : chr [1:25320] "Female" "Male" "Male" "Male" ...
##  $ Ethnicity               : chr [1:25320] "Not Hispanic" "Not Hispanic" "Not Hispanic" "Not Hispanic" ...
##  $ Race                    : chr [1:25320] "White" "White" "Black" "Asian" ...
##  $ Attending Germantown    : chr [1:25320] "Yes" "No" "No" "No" ...
##  $ Attending Rockville     : chr [1:25320] "Yes" "Yes" "Yes" "Yes" ...
##  $ Attending Takoma Park/SS: chr [1:25320] "No" "No" "No" "No" ...
##  $ Attend Day or Evening   : chr [1:25320] "Day Only" "Evening Only" "Day & Evening" "Day Only" ...
##  $ MC Program Description  : chr [1:25320] "Health Sciences (Pre-Clinical Studies)" "Building Trades Technology (AA & AAS)" "Computer Gaming & Simulation (AA - All Tracks)" "Graphic Design (AA, AAS, & AFA - All Tracks)" ...
##  $ Age Group               : chr [1:25320] "25 - 29" "21 - 24" "20 or Younger" "20 or Younger" ...
##  $ HS Category             : chr [1:25320] "Foreign Country" "MCPS" "MCPS" "MCPS" ...
##  $ MCPS High School        : chr [1:25320] NA "Sherwood High School" "Quince Orchard Sr High School" "Thomas Sprigg Wootton High Sch" ...
##  $ City in MD              : chr [1:25320] "Bethesda" "Olney" "Gaithersburg" "North Potomac" ...
##  $ State                   : chr [1:25320] "MD" "MD" "MD" "MD" ...
##  $ ZIP                     : num [1:25320] 20816 20832 20877 20878 20906 ...
##  $ County in MD            : chr [1:25320] "Montgomery" "Montgomery" "Montgomery" "Montgomery" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Fall Term` = col_double(),
##   ..   `Student Type` = col_character(),
##   ..   `Student Status` = col_character(),
##   ..   Gender = col_character(),
##   ..   Ethnicity = col_character(),
##   ..   Race = col_character(),
##   ..   `Attending Germantown` = col_character(),
##   ..   `Attending Rockville` = col_character(),
##   ..   `Attending Takoma Park/SS` = col_character(),
##   ..   `Attend Day or Evening` = col_character(),
##   ..   `MC Program Description` = col_character(),
##   ..   `Age Group` = col_character(),
##   ..   `HS Category` = col_character(),
##   ..   `MCPS High School` = col_character(),
##   ..   `City in MD` = col_character(),
##   ..   State = col_character(),
##   ..   ZIP = col_double(),
##   ..   `County in MD` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# List the column names of the working copy
colnames(df)
##  [1] "Fall Term"                "Student Type"            
##  [3] "Student Status"           "Gender"                  
##  [5] "Ethnicity"                "Race"                    
##  [7] "Attending Germantown"     "Attending Rockville"     
##  [9] "Attending Takoma Park/SS" "Attend Day or Evening"   
## [11] "MC Program Description"   "Age Group"               
## [13] "HS Category"              "MCPS High School"        
## [15] "City in MD"               "State"                   
## [17] "ZIP"                      "County in MD"
# Preview the first five rows
df |> head(n = 5)
## # A tibble: 5 × 18
##   `Fall Term` `Student Type` `Student Status` Gender Ethnicity    Race 
##         <dbl> <chr>          <chr>            <chr>  <chr>        <chr>
## 1        2015 Continuing     Full-Time        Female Not Hispanic White
## 2        2015 Continuing     Part-Time        Male   Not Hispanic White
## 3        2015 Continuing     Part-Time        Male   Not Hispanic Black
## 4        2015 New            Full-Time        Male   Not Hispanic Asian
## 5        2015 New            Full-Time        Female Hispanic     White
## # ℹ 12 more variables: `Attending Germantown` <chr>,
## #   `Attending Rockville` <chr>, `Attending Takoma Park/SS` <chr>,
## #   `Attend Day or Evening` <chr>, `MC Program Description` <chr>,
## #   `Age Group` <chr>, `HS Category` <chr>, `MCPS High School` <chr>,
## #   `City in MD` <chr>, State <chr>, ZIP <dbl>, `County in MD` <chr>
# Preview the last five rows
df |> tail(n = 5)
## # A tibble: 5 × 18
##   `Fall Term` `Student Type` `Student Status` Gender Ethnicity    Race 
##         <dbl> <chr>          <chr>            <chr>  <chr>        <chr>
## 1        2015 HS Student     Part-Time        Female Not Hispanic Black
## 2        2015 Continuing     Full-Time        Male   Not Hispanic Asian
## 3        2015 New            Full-Time        Male   Not Hispanic White
## 4        2015 Continuing     Full-Time        Male   Hispanic     Black
## 5        2015 HS Student     Part-Time        Male   Not Hispanic White
## # ℹ 12 more variables: `Attending Germantown` <chr>,
## #   `Attending Rockville` <chr>, `Attending Takoma Park/SS` <chr>,
## #   `Attend Day or Evening` <chr>, `MC Program Description` <chr>,
## #   `Age Group` <chr>, `HS Category` <chr>, `MCPS High School` <chr>,
## #   `City in MD` <chr>, State <chr>, ZIP <dbl>, `County in MD` <chr>

Data Cleaning

# Count Rockville-campus students aged 21-24 by gender (Male/Female only).
#
# No explicit is.na() checks are needed: filter() drops rows whose condition
# evaluates to NA, and %in% never returns NA, so rows with a missing campus
# flag, age group, or gender are already excluded by the conditions below.
# NOTE(review): there is no filter on `Fall Term`, although the plot title
# further down says 2015 -- confirm the file holds a single term.
df2 <- df |>
  filter(
    `Attending Rockville` == "Yes",
    `Age Group` == "21 - 24",
    Gender %in% c("Male", "Female")
  ) |>
  # One row per gender, largest group first; the result is an ungrouped tibble
  count(Gender, name = "count_of_students", sort = TRUE)

df2
## # A tibble: 2 × 2
##   Gender count_of_students
##   <chr>              <int>
## 1 Male                2295
## 2 Female              1988
# Observed counts, in the row order of df2
observed <- df2$count_of_students

# Null hypothesis: equal proportion of males and females
theoretical_prop <- c(0.5, 0.5)

# Expected counts under the null. They are left unrounded: expected counts
# need not be whole numbers, and rounding would make them sum to more than
# the observed total and differ from the values chisq.test() works with.
expected_values <- theoretical_prop * sum(observed)
expected_values
## [1] 2141.5 2141.5

Results

# Chi-square goodness-of-fit test of the observed gender counts against the
# hypothesised proportions. Passing `p` explicitly ties the test to
# theoretical_prop instead of relying on chisq.test()'s default of equal
# probabilities, so the test stays correct if the null proportions change.
chi_result <- chisq.test(observed, p = theoretical_prop)
chi_result
## 
##  Chi-squared test for given probabilities
## 
## data:  observed
## X-squared = 22.005, df = 1, p-value = 2.719e-06

Visualization

# Bar chart comparing the male and female student counts held in df2
df2 |>
  ggplot(mapping = aes(x = Gender, y = count_of_students, fill = Gender)) +
  geom_col(width = 0.6, colour = "black") +
  scale_fill_manual(values = c(Female = "#EF94FB", Male = "#1f77b4")) +
  ggtitle("Students (Age 21-24) at Rockville Campus in 2015") +
  xlab("Gender") +
  ylab("Number of Students") +
  theme_minimal()

Conclusion