Enrollment Sample Presentation

Import the Excel file provided with this document into a programming language of your choice. You need this step to be able to manipulate/extract the information on the Excel file.

##Load SQL to R ##All original SQL queries were submitted seperately

library(RODBC)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(Amelia)

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

library(knitr)

db_conn <- odbcConnect("LocalDSN", rows_at_time = 1)

sql <-"
select*
  from Enrollment_Sample_File"

enrollment <- sqlQuery(db_conn, sql, stringsAsFactors = FALSE)

knitr::opts_chunk$set(echo = TRUE)

##Examine data types and structures, address if loaded properly
str(enrollment)

## 'data.frame':    503454 obs. of  14 variables:
##  $ Dummy_Student_ID: int  1 2 2 2 2 3 3 3 3 3 ...
##  $ CREDITS         : num  3 1 3 3 3 3 3 3 3 3 ...
##  $ ACAD_CAREER     : chr  "UGRD" "UGRD" "UGRD" "UGRD" ...
##  $ ACAD_ORG        : int  200033 650130 650110 650100 300350 250290 250290 250290 250290 650080 ...
##  $ CLASS_NBR       : int  3500 8798 5543 8253 2043 6736 6743 8085 5679 2529 ...
##  $ CRSNAME         : chr  "EEC2271" "LIS2004" "STA2023" "MAN4941" ...
##  $ COURSE_CAMPUS   : chr  "HOME" "VIRTU" "VIRTU" "VIRTU" ...
##  $ CLASS_DESCR     : chr  "Wrkng Yng Ch Spec Nd" "Strategies for Online Research" "Statistical Methods" "Management Internship" ...
##  $ ENROLLMENT_TERM : int  2197 2197 2197 2203 2203 2197 2197 2203 2203 2205 ...
##  $ ACAD_PLAN       : chr  "10809" "P9200" "P9200" "P9200" ...
##  $ PLAN_DESCR      : chr  "Teaching-PreElemen/Early Child" "BAS Supervision & Management" "BAS Supervision & Management" "BAS Supervision & Management" ...
##  $ AGE             : int  51 33 33 34 34 24 24 24 24 25 ...
##  $ SEX             : chr  "F" "F" "F" "F" ...
##  $ ETHNICITY       : chr  "White Non-Hispanic" "Hispanic" "Hispanic" "Hispanic" ...

summary(enrollment)

##  Dummy_Student_ID    CREDITS         ACAD_CAREER           ACAD_ORG     
##  Min.   :     1   Min.   : 0.03333   Length:503454      Min.   :100020  
##  1st Qu.: 28255   1st Qu.: 3.00000   Class :character   1st Qu.:300040  
##  Median : 52479   Median : 3.00000   Mode  :character   Median :350100  
##  Mean   : 50038   Mean   : 2.83116                      Mean   :366852  
##  3rd Qu.: 71946   3rd Qu.: 3.00000                      3rd Qu.:450160  
##  Max.   :101608   Max.   :25.50000                      Max.   :700060  
##                   NA's   :89                                            
##    CLASS_NBR       CRSNAME          COURSE_CAMPUS      CLASS_DESCR       
##  Min.   : 1002   Length:503454      Length:503454      Length:503454     
##  1st Qu.: 3434   Class :character   Class :character   Class :character  
##  Median : 5981   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 6974                                                           
##  3rd Qu.: 9752                                                           
##  Max.   :17317                                                           
##                                                                          
##  ENROLLMENT_TERM  ACAD_PLAN          PLAN_DESCR             AGE       
##  Min.   :2197    Length:503454      Length:503454      Min.   :-1.00  
##  1st Qu.:2197    Class :character   Class :character   1st Qu.:19.00  
##  Median :2203    Mode  :character   Mode  :character   Median :21.00  
##  Mean   :2201                                          Mean   :24.47  
##  3rd Qu.:2203                                          3rd Qu.:27.00  
##  Max.   :2205                                          Max.   :97.00  
##                                                        NA's   :7      
##      SEX             ETHNICITY        
##  Length:503454      Length:503454     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
##

mdc <- enrollment
##mdc$Dummy_Student_ID <- as.character(mdc$Dummy_Student_ID)
mdc$ACAD_ORG <- as.character(mdc$ACAD_ORG)
mdc$CLASS_NBR <- as.character(mdc$CLASS_NBR)
mdc$CREDITS <- as.integer(mdc$CREDITS)
str(mdc)

## 'data.frame':    503454 obs. of  14 variables:
##  $ Dummy_Student_ID: int  1 2 2 2 2 3 3 3 3 3 ...
##  $ CREDITS         : int  3 1 3 3 3 3 3 3 3 3 ...
##  $ ACAD_CAREER     : chr  "UGRD" "UGRD" "UGRD" "UGRD" ...
##  $ ACAD_ORG        : chr  "200033" "650130" "650110" "650100" ...
##  $ CLASS_NBR       : chr  "3500" "8798" "5543" "8253" ...
##  $ CRSNAME         : chr  "EEC2271" "LIS2004" "STA2023" "MAN4941" ...
##  $ COURSE_CAMPUS   : chr  "HOME" "VIRTU" "VIRTU" "VIRTU" ...
##  $ CLASS_DESCR     : chr  "Wrkng Yng Ch Spec Nd" "Strategies for Online Research" "Statistical Methods" "Management Internship" ...
##  $ ENROLLMENT_TERM : int  2197 2197 2197 2203 2203 2197 2197 2203 2203 2205 ...
##  $ ACAD_PLAN       : chr  "10809" "P9200" "P9200" "P9200" ...
##  $ PLAN_DESCR      : chr  "Teaching-PreElemen/Early Child" "BAS Supervision & Management" "BAS Supervision & Management" "BAS Supervision & Management" ...
##  $ AGE             : int  51 33 33 34 34 24 24 24 24 25 ...
##  $ SEX             : chr  "F" "F" "F" "F" ...
##  $ ETHNICITY       : chr  "White Non-Hispanic" "Hispanic" "Hispanic" "Hispanic" ...

summary(mdc)

##  Dummy_Student_ID    CREDITS       ACAD_CAREER          ACAD_ORG        
##  Min.   :     1   Min.   : 0.000   Length:503454      Length:503454     
##  1st Qu.: 28255   1st Qu.: 3.000   Class :character   Class :character  
##  Median : 52479   Median : 3.000   Mode  :character   Mode  :character  
##  Mean   : 50038   Mean   : 2.794                                        
##  3rd Qu.: 71946   3rd Qu.: 3.000                                        
##  Max.   :101608   Max.   :25.000                                        
##                   NA's   :89                                            
##   CLASS_NBR           CRSNAME          COURSE_CAMPUS      CLASS_DESCR       
##  Length:503454      Length:503454      Length:503454      Length:503454     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  ENROLLMENT_TERM  ACAD_PLAN          PLAN_DESCR             AGE       
##  Min.   :2197    Length:503454      Length:503454      Min.   :-1.00  
##  1st Qu.:2197    Class :character   Class :character   1st Qu.:19.00  
##  Median :2203    Mode  :character   Mode  :character   Median :21.00  
##  Mean   :2201                                          Mean   :24.47  
##  3rd Qu.:2203                                          3rd Qu.:27.00  
##  Max.   :2205                                          Max.   :97.00  
##                                                        NA's   :7      
##      SEX             ETHNICITY        
##  Length:503454      Length:503454     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
##

missmap(mdc, main = "Missing values vs observed")

boxplot(mdc[,c(2,12)])

Find the total number of students enrolled and total number of credits College-wide (this is the sum of all the credits in the file using the credits column. For the total number of students use the Dummy Student ID column).

TotalStudents <- n_distinct(mdc[,c(1)], na.rm = FALSE)
TotalStudents

## [1] 101608

TotalCredits <- sum(mdc[,c(2)], na.rm = TRUE)
TotalCredits

## [1] 1406224

Find the total number of students and total number of credits by Course Campus.

knitr::opts_chunk$set(echo = TRUE)
q3 <- mdc %>%
  select(COURSE_CAMPUS,Dummy_Student_ID,CREDITS) %>%
  group_by(COURSE_CAMPUS) %>%
  summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), TotalCredits = sum(CREDITS, na.rm = TRUE))

knitr::kable(q3,"pipe")

COURSE_CAMPUS	TotalStudents	TotalCredits
HIA	11413	105523
HOME	9493	83468
IAC	12603	102821
KENDL	30824	339116
MEDI	5346	56380
MEEK	2696	12480
NORTH	23311	227963
VIRTU	22717	170275
WEST	9137	81921
WOLF	23848	226277

Summarize other relevant information (information that could be useful in your opinion) in the file for College-wide and by Course Campus.

AvgAge = mean(mdc$AGE, na.rm = TRUE)
AvgAge

## [1] 24.46693

careers <- mdc %>%
  select(ACAD_CAREER,Dummy_Student_ID,CREDITS,AGE) %>%
  group_by(ACAD_CAREER) %>%
  summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE)) %>%
  arrange(desc(TotalStudents))

knitr::kable(careers,"pipe")

ACAD_CAREER	TotalStudents	AvgCredits	AvgAge
UGRD	77971	2.926010	23.23798
CEPD	24342	1.552745	36.65484
PSAV	2495	2.059673	27.16394
CPP	33	2.297170	39.66981

sex <- mdc %>%
  select(SEX,Dummy_Student_ID,CREDITS,AGE) %>%
  group_by(SEX) %>%
  summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE)) %>%
  arrange(desc(TotalStudents))

knitr::kable(sex,"pipe")

SEX	TotalStudents	AvgCredits	AvgAge
F	58688	2.748599	24.94523
M	42830	2.859703	23.77908
U	105	1.537634	23.11892

ethnicity <- mdc %>%
  select(ETHNICITY,Dummy_Student_ID,CREDITS,AGE) %>%
  group_by(ETHNICITY) %>%
  summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE)) %>%
  arrange(desc(TotalStudents))

knitr::kable(ethnicity,"pipe")

ETHNICITY	TotalStudents	AvgCredits	AvgAge
Hispanic	70718	2.856282	23.73142
Black Non-Hispanic	13871	2.820815	25.58571
Unknown	9749	1.710789	31.19204
White Non-Hispanic	5382	2.797926	25.81302
Asian	1073	2.828619	24.15829
TWO OR MORE	763	2.847541	23.65159
American Indian	71	2.804217	24.28614
Pacific Islander	53	2.835165	23.99634

campus <- mdc %>%
  select(COURSE_CAMPUS,Dummy_Student_ID,CREDITS,AGE) %>%
  group_by(COURSE_CAMPUS) %>%
  summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE)) %>%
  arrange(desc(TotalStudents))

knitr::kable(campus,"pipe")

COURSE_CAMPUS	TotalStudents	AvgCredits	AvgAge
KENDL	30824	2.817795	22.91340
WOLF	23848	2.815581	25.60516
NORTH	23311	2.774792	24.50542
VIRTU	22717	2.941202	24.82228
IAC	12603	2.798688	26.43746
HIA	11413	2.937477	23.06865
HOME	9493	2.687747	23.30970
WEST	9137	2.746999	23.48582
MEDI	5346	2.253938	28.82350
MEEK	2696	3.081482	25.93407

chart <- mdc %>%
  select(ENROLLMENT_TERM,ETHNICITY,Dummy_Student_ID,CREDITS,AGE) %>%
  group_by(ENROLLMENT_TERM,ETHNICITY) %>%
  summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE),TotalCredits = n_distinct(Dummy_Student_ID, na.rm = TRUE)) %>%
  arrange(ENROLLMENT_TERM)

## `summarise()` has grouped output by 'ENROLLMENT_TERM'. You can override using
## the `.groups` argument.

knitr::kable(chart,"pipe")

ENROLLMENT_TERM	ETHNICITY	TotalStudents	AvgCredits	AvgAge	TotalCredits
2197	American Indian	53	2.831081	24.17568	53
2197	Asian	816	2.780695	24.14203	816
2197	Black Non-Hispanic	10148	2.803882	25.12479	10148
2197	Hispanic	53466	2.818179	23.52855	53466
2197	Pacific Islander	41	2.791304	23.66087	41
2197	TWO OR MORE	561	2.850807	23.17396	561
2197	Unknown	6238	1.566777	31.57535	6238
2197	White Non-Hispanic	3967	2.753427	25.71002	3967
2203	American Indian	51	2.795775	24.06338	51
2203	Asian	747	2.863990	24.06477	747
2203	Black Non-Hispanic	9519	2.846784	25.53170	9519
2203	Hispanic	48877	2.872937	23.73701	48877
2203	Pacific Islander	38	2.876106	23.59292	38
2203	TWO OR MORE	505	2.862248	23.58609	505
2203	Unknown	5032	1.722745	31.48578	5032
2203	White Non-Hispanic	3632	2.830379	25.63575	3632
2205	American Indian	20	2.738095	25.42857	20
2205	Asian	435	2.865672	24.38543	435
2205	Black Non-Hispanic	5380	2.805433	26.65042	5380
2205	Hispanic	26407	2.911757	24.20502	26407
2205	Pacific Islander	21	2.844444	25.86667	21
2205	TWO OR MORE	303	2.810881	24.85622	303
2205	Unknown	1355	2.325690	28.36922	1355
2205	White Non-Hispanic	2043	2.833695	26.42987	2043

Prepare your findings in a 10-15 min presentation format. You will have an opportunity to share and discuss these findings during your second interview.

ggplot(chart, aes(ENROLLMENT_TERM,TotalStudents, colour = ETHNICITY)) +
  geom_line() +
  ggtitle('Ethnic Diveristy by Term')

label <- ethnicity$ETHNICITY
slice <- ethnicity$TotalStudents

pie(slice, labels = label, main = "Ethnic Split")

Enrollment Sample Presentation

Kevin Gonzalez Garcia

9/28/2022