1. Import the Excel file provided with this document into a programming language of your choice. You need this step to be able to manipulate/extract the information on the Excel file.

##Load SQL to R ##All original SQL queries were submitted seperately

library(RODBC)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(knitr)

db_conn <- odbcConnect("LocalDSN", rows_at_time = 1)

sql <-"
select*
  from Enrollment_Sample_File"

enrollment <- sqlQuery(db_conn, sql, stringsAsFactors = FALSE)
knitr::opts_chunk$set(echo = TRUE)
##Examine data types and structures, address if loaded properly
str(enrollment)
## 'data.frame':    503454 obs. of  14 variables:
##  $ Dummy_Student_ID: int  1 2 2 2 2 3 3 3 3 3 ...
##  $ CREDITS         : num  3 1 3 3 3 3 3 3 3 3 ...
##  $ ACAD_CAREER     : chr  "UGRD" "UGRD" "UGRD" "UGRD" ...
##  $ ACAD_ORG        : int  200033 650130 650110 650100 300350 250290 250290 250290 250290 650080 ...
##  $ CLASS_NBR       : int  3500 8798 5543 8253 2043 6736 6743 8085 5679 2529 ...
##  $ CRSNAME         : chr  "EEC2271" "LIS2004" "STA2023" "MAN4941" ...
##  $ COURSE_CAMPUS   : chr  "HOME" "VIRTU" "VIRTU" "VIRTU" ...
##  $ CLASS_DESCR     : chr  "Wrkng Yng Ch Spec Nd" "Strategies for Online Research" "Statistical Methods" "Management Internship" ...
##  $ ENROLLMENT_TERM : int  2197 2197 2197 2203 2203 2197 2197 2203 2203 2205 ...
##  $ ACAD_PLAN       : chr  "10809" "P9200" "P9200" "P9200" ...
##  $ PLAN_DESCR      : chr  "Teaching-PreElemen/Early Child" "BAS Supervision & Management" "BAS Supervision & Management" "BAS Supervision & Management" ...
##  $ AGE             : int  51 33 33 34 34 24 24 24 24 25 ...
##  $ SEX             : chr  "F" "F" "F" "F" ...
##  $ ETHNICITY       : chr  "White Non-Hispanic" "Hispanic" "Hispanic" "Hispanic" ...
summary(enrollment)
##  Dummy_Student_ID    CREDITS         ACAD_CAREER           ACAD_ORG     
##  Min.   :     1   Min.   : 0.03333   Length:503454      Min.   :100020  
##  1st Qu.: 28255   1st Qu.: 3.00000   Class :character   1st Qu.:300040  
##  Median : 52479   Median : 3.00000   Mode  :character   Median :350100  
##  Mean   : 50038   Mean   : 2.83116                      Mean   :366852  
##  3rd Qu.: 71946   3rd Qu.: 3.00000                      3rd Qu.:450160  
##  Max.   :101608   Max.   :25.50000                      Max.   :700060  
##                   NA's   :89                                            
##    CLASS_NBR       CRSNAME          COURSE_CAMPUS      CLASS_DESCR       
##  Min.   : 1002   Length:503454      Length:503454      Length:503454     
##  1st Qu.: 3434   Class :character   Class :character   Class :character  
##  Median : 5981   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 6974                                                           
##  3rd Qu.: 9752                                                           
##  Max.   :17317                                                           
##                                                                          
##  ENROLLMENT_TERM  ACAD_PLAN          PLAN_DESCR             AGE       
##  Min.   :2197    Length:503454      Length:503454      Min.   :-1.00  
##  1st Qu.:2197    Class :character   Class :character   1st Qu.:19.00  
##  Median :2203    Mode  :character   Mode  :character   Median :21.00  
##  Mean   :2201                                          Mean   :24.47  
##  3rd Qu.:2203                                          3rd Qu.:27.00  
##  Max.   :2205                                          Max.   :97.00  
##                                                        NA's   :7      
##      SEX             ETHNICITY        
##  Length:503454      Length:503454     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
mdc <- enrollment
##mdc$Dummy_Student_ID <- as.character(mdc$Dummy_Student_ID)
mdc$ACAD_ORG <- as.character(mdc$ACAD_ORG)
mdc$CLASS_NBR <- as.character(mdc$CLASS_NBR)
mdc$CREDITS <- as.integer(mdc$CREDITS)
str(mdc)
## 'data.frame':    503454 obs. of  14 variables:
##  $ Dummy_Student_ID: int  1 2 2 2 2 3 3 3 3 3 ...
##  $ CREDITS         : int  3 1 3 3 3 3 3 3 3 3 ...
##  $ ACAD_CAREER     : chr  "UGRD" "UGRD" "UGRD" "UGRD" ...
##  $ ACAD_ORG        : chr  "200033" "650130" "650110" "650100" ...
##  $ CLASS_NBR       : chr  "3500" "8798" "5543" "8253" ...
##  $ CRSNAME         : chr  "EEC2271" "LIS2004" "STA2023" "MAN4941" ...
##  $ COURSE_CAMPUS   : chr  "HOME" "VIRTU" "VIRTU" "VIRTU" ...
##  $ CLASS_DESCR     : chr  "Wrkng Yng Ch Spec Nd" "Strategies for Online Research" "Statistical Methods" "Management Internship" ...
##  $ ENROLLMENT_TERM : int  2197 2197 2197 2203 2203 2197 2197 2203 2203 2205 ...
##  $ ACAD_PLAN       : chr  "10809" "P9200" "P9200" "P9200" ...
##  $ PLAN_DESCR      : chr  "Teaching-PreElemen/Early Child" "BAS Supervision & Management" "BAS Supervision & Management" "BAS Supervision & Management" ...
##  $ AGE             : int  51 33 33 34 34 24 24 24 24 25 ...
##  $ SEX             : chr  "F" "F" "F" "F" ...
##  $ ETHNICITY       : chr  "White Non-Hispanic" "Hispanic" "Hispanic" "Hispanic" ...
summary(mdc)
##  Dummy_Student_ID    CREDITS       ACAD_CAREER          ACAD_ORG        
##  Min.   :     1   Min.   : 0.000   Length:503454      Length:503454     
##  1st Qu.: 28255   1st Qu.: 3.000   Class :character   Class :character  
##  Median : 52479   Median : 3.000   Mode  :character   Mode  :character  
##  Mean   : 50038   Mean   : 2.794                                        
##  3rd Qu.: 71946   3rd Qu.: 3.000                                        
##  Max.   :101608   Max.   :25.000                                        
##                   NA's   :89                                            
##   CLASS_NBR           CRSNAME          COURSE_CAMPUS      CLASS_DESCR       
##  Length:503454      Length:503454      Length:503454      Length:503454     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  ENROLLMENT_TERM  ACAD_PLAN          PLAN_DESCR             AGE       
##  Min.   :2197    Length:503454      Length:503454      Min.   :-1.00  
##  1st Qu.:2197    Class :character   Class :character   1st Qu.:19.00  
##  Median :2203    Mode  :character   Mode  :character   Median :21.00  
##  Mean   :2201                                          Mean   :24.47  
##  3rd Qu.:2203                                          3rd Qu.:27.00  
##  Max.   :2205                                          Max.   :97.00  
##                                                        NA's   :7      
##      SEX             ETHNICITY        
##  Length:503454      Length:503454     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
missmap(mdc, main = "Missing values vs observed")

boxplot(mdc[,c(2,12)])

  1. Find the total number of students enrolled and total number of credits College-wide (this is the sum of all the credits in the file using the credits column. For the total number of students use the Dummy Student ID column).
TotalStudents <- n_distinct(mdc[,c(1)], na.rm = FALSE)
TotalStudents
## [1] 101608
TotalCredits <- sum(mdc[,c(2)], na.rm = TRUE)
TotalCredits
## [1] 1406224
  1. Find the total number of students and total number of credits by Course Campus.
knitr::opts_chunk$set(echo = TRUE)
q3 <- mdc %>%
  select(COURSE_CAMPUS,Dummy_Student_ID,CREDITS) %>%
  group_by(COURSE_CAMPUS) %>%
  summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), TotalCredits = sum(CREDITS, na.rm = TRUE))

knitr::kable(q3,"pipe")
COURSE_CAMPUS TotalStudents TotalCredits
HIA 11413 105523
HOME 9493 83468
IAC 12603 102821
KENDL 30824 339116
MEDI 5346 56380
MEEK 2696 12480
NORTH 23311 227963
VIRTU 22717 170275
WEST 9137 81921
WOLF 23848 226277
  1. Summarize other relevant information (information that could be useful in your opinion) in the file for College-wide and by Course Campus.
AvgAge = mean(mdc$AGE, na.rm = TRUE)
AvgAge
## [1] 24.46693
careers <- mdc %>%
  select(ACAD_CAREER,Dummy_Student_ID,CREDITS,AGE) %>%
  group_by(ACAD_CAREER) %>%
  summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE)) %>%
  arrange(desc(TotalStudents))

knitr::kable(careers,"pipe")
ACAD_CAREER TotalStudents AvgCredits AvgAge
UGRD 77971 2.926010 23.23798
CEPD 24342 1.552745 36.65484
PSAV 2495 2.059673 27.16394
CPP 33 2.297170 39.66981
sex <- mdc %>%
  select(SEX,Dummy_Student_ID,CREDITS,AGE) %>%
  group_by(SEX) %>%
  summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE)) %>%
  arrange(desc(TotalStudents))

knitr::kable(sex,"pipe")
SEX TotalStudents AvgCredits AvgAge
F 58688 2.748599 24.94523
M 42830 2.859703 23.77908
U 105 1.537634 23.11892
ethnicity <- mdc %>%
  select(ETHNICITY,Dummy_Student_ID,CREDITS,AGE) %>%
  group_by(ETHNICITY) %>%
  summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE)) %>%
  arrange(desc(TotalStudents))

knitr::kable(ethnicity,"pipe")
ETHNICITY TotalStudents AvgCredits AvgAge
Hispanic 70718 2.856282 23.73142
Black Non-Hispanic 13871 2.820815 25.58571
Unknown 9749 1.710789 31.19204
White Non-Hispanic 5382 2.797926 25.81302
Asian 1073 2.828619 24.15829
TWO OR MORE 763 2.847541 23.65159
American Indian 71 2.804217 24.28614
Pacific Islander 53 2.835165 23.99634
campus <- mdc %>%
  select(COURSE_CAMPUS,Dummy_Student_ID,CREDITS,AGE) %>%
  group_by(COURSE_CAMPUS) %>%
  summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE)) %>%
  arrange(desc(TotalStudents))

knitr::kable(campus,"pipe")
COURSE_CAMPUS TotalStudents AvgCredits AvgAge
KENDL 30824 2.817795 22.91340
WOLF 23848 2.815581 25.60516
NORTH 23311 2.774792 24.50542
VIRTU 22717 2.941202 24.82228
IAC 12603 2.798688 26.43746
HIA 11413 2.937477 23.06865
HOME 9493 2.687747 23.30970
WEST 9137 2.746999 23.48582
MEDI 5346 2.253938 28.82350
MEEK 2696 3.081482 25.93407
chart <- mdc %>%
  select(ENROLLMENT_TERM,ETHNICITY,Dummy_Student_ID,CREDITS,AGE) %>%
  group_by(ENROLLMENT_TERM,ETHNICITY) %>%
  summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE),TotalCredits = n_distinct(Dummy_Student_ID, na.rm = TRUE)) %>%
  arrange(ENROLLMENT_TERM)
## `summarise()` has grouped output by 'ENROLLMENT_TERM'. You can override using
## the `.groups` argument.
knitr::kable(chart,"pipe")
ENROLLMENT_TERM ETHNICITY TotalStudents AvgCredits AvgAge TotalCredits
2197 American Indian 53 2.831081 24.17568 53
2197 Asian 816 2.780695 24.14203 816
2197 Black Non-Hispanic 10148 2.803882 25.12479 10148
2197 Hispanic 53466 2.818179 23.52855 53466
2197 Pacific Islander 41 2.791304 23.66087 41
2197 TWO OR MORE 561 2.850807 23.17396 561
2197 Unknown 6238 1.566777 31.57535 6238
2197 White Non-Hispanic 3967 2.753427 25.71002 3967
2203 American Indian 51 2.795775 24.06338 51
2203 Asian 747 2.863990 24.06477 747
2203 Black Non-Hispanic 9519 2.846784 25.53170 9519
2203 Hispanic 48877 2.872937 23.73701 48877
2203 Pacific Islander 38 2.876106 23.59292 38
2203 TWO OR MORE 505 2.862248 23.58609 505
2203 Unknown 5032 1.722745 31.48578 5032
2203 White Non-Hispanic 3632 2.830379 25.63575 3632
2205 American Indian 20 2.738095 25.42857 20
2205 Asian 435 2.865672 24.38543 435
2205 Black Non-Hispanic 5380 2.805433 26.65042 5380
2205 Hispanic 26407 2.911757 24.20502 26407
2205 Pacific Islander 21 2.844444 25.86667 21
2205 TWO OR MORE 303 2.810881 24.85622 303
2205 Unknown 1355 2.325690 28.36922 1355
2205 White Non-Hispanic 2043 2.833695 26.42987 2043
  1. Prepare your findings in a 10-15 min presentation format. You will have an opportunity to share and discuss these findings during your second interview.
ggplot(chart, aes(ENROLLMENT_TERM,TotalStudents, colour = ETHNICITY)) +
  geom_line() +
  ggtitle('Ethnic Diveristy by Term')

label <- ethnicity$ETHNICITY
slice <- ethnicity$TotalStudents

pie(slice, labels = label, main = "Ethnic Split")