- Import the Excel file provided with this document into a programming language of your choice. You need this step to be able to manipulate/extract the information on the Excel file.
##Load SQL to R ##All original SQL queries were submitted seperately
library(RODBC)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(knitr)
db_conn <- odbcConnect("LocalDSN", rows_at_time = 1)
sql <-"
select*
from Enrollment_Sample_File"
enrollment <- sqlQuery(db_conn, sql, stringsAsFactors = FALSE)
knitr::opts_chunk$set(echo = TRUE)
##Examine data types and structures, address if loaded properly
str(enrollment)
## 'data.frame': 503454 obs. of 14 variables:
## $ Dummy_Student_ID: int 1 2 2 2 2 3 3 3 3 3 ...
## $ CREDITS : num 3 1 3 3 3 3 3 3 3 3 ...
## $ ACAD_CAREER : chr "UGRD" "UGRD" "UGRD" "UGRD" ...
## $ ACAD_ORG : int 200033 650130 650110 650100 300350 250290 250290 250290 250290 650080 ...
## $ CLASS_NBR : int 3500 8798 5543 8253 2043 6736 6743 8085 5679 2529 ...
## $ CRSNAME : chr "EEC2271" "LIS2004" "STA2023" "MAN4941" ...
## $ COURSE_CAMPUS : chr "HOME" "VIRTU" "VIRTU" "VIRTU" ...
## $ CLASS_DESCR : chr "Wrkng Yng Ch Spec Nd" "Strategies for Online Research" "Statistical Methods" "Management Internship" ...
## $ ENROLLMENT_TERM : int 2197 2197 2197 2203 2203 2197 2197 2203 2203 2205 ...
## $ ACAD_PLAN : chr "10809" "P9200" "P9200" "P9200" ...
## $ PLAN_DESCR : chr "Teaching-PreElemen/Early Child" "BAS Supervision & Management" "BAS Supervision & Management" "BAS Supervision & Management" ...
## $ AGE : int 51 33 33 34 34 24 24 24 24 25 ...
## $ SEX : chr "F" "F" "F" "F" ...
## $ ETHNICITY : chr "White Non-Hispanic" "Hispanic" "Hispanic" "Hispanic" ...
summary(enrollment)
## Dummy_Student_ID CREDITS ACAD_CAREER ACAD_ORG
## Min. : 1 Min. : 0.03333 Length:503454 Min. :100020
## 1st Qu.: 28255 1st Qu.: 3.00000 Class :character 1st Qu.:300040
## Median : 52479 Median : 3.00000 Mode :character Median :350100
## Mean : 50038 Mean : 2.83116 Mean :366852
## 3rd Qu.: 71946 3rd Qu.: 3.00000 3rd Qu.:450160
## Max. :101608 Max. :25.50000 Max. :700060
## NA's :89
## CLASS_NBR CRSNAME COURSE_CAMPUS CLASS_DESCR
## Min. : 1002 Length:503454 Length:503454 Length:503454
## 1st Qu.: 3434 Class :character Class :character Class :character
## Median : 5981 Mode :character Mode :character Mode :character
## Mean : 6974
## 3rd Qu.: 9752
## Max. :17317
##
## ENROLLMENT_TERM ACAD_PLAN PLAN_DESCR AGE
## Min. :2197 Length:503454 Length:503454 Min. :-1.00
## 1st Qu.:2197 Class :character Class :character 1st Qu.:19.00
## Median :2203 Mode :character Mode :character Median :21.00
## Mean :2201 Mean :24.47
## 3rd Qu.:2203 3rd Qu.:27.00
## Max. :2205 Max. :97.00
## NA's :7
## SEX ETHNICITY
## Length:503454 Length:503454
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
mdc <- enrollment
##mdc$Dummy_Student_ID <- as.character(mdc$Dummy_Student_ID)
mdc$ACAD_ORG <- as.character(mdc$ACAD_ORG)
mdc$CLASS_NBR <- as.character(mdc$CLASS_NBR)
mdc$CREDITS <- as.integer(mdc$CREDITS)
str(mdc)
## 'data.frame': 503454 obs. of 14 variables:
## $ Dummy_Student_ID: int 1 2 2 2 2 3 3 3 3 3 ...
## $ CREDITS : int 3 1 3 3 3 3 3 3 3 3 ...
## $ ACAD_CAREER : chr "UGRD" "UGRD" "UGRD" "UGRD" ...
## $ ACAD_ORG : chr "200033" "650130" "650110" "650100" ...
## $ CLASS_NBR : chr "3500" "8798" "5543" "8253" ...
## $ CRSNAME : chr "EEC2271" "LIS2004" "STA2023" "MAN4941" ...
## $ COURSE_CAMPUS : chr "HOME" "VIRTU" "VIRTU" "VIRTU" ...
## $ CLASS_DESCR : chr "Wrkng Yng Ch Spec Nd" "Strategies for Online Research" "Statistical Methods" "Management Internship" ...
## $ ENROLLMENT_TERM : int 2197 2197 2197 2203 2203 2197 2197 2203 2203 2205 ...
## $ ACAD_PLAN : chr "10809" "P9200" "P9200" "P9200" ...
## $ PLAN_DESCR : chr "Teaching-PreElemen/Early Child" "BAS Supervision & Management" "BAS Supervision & Management" "BAS Supervision & Management" ...
## $ AGE : int 51 33 33 34 34 24 24 24 24 25 ...
## $ SEX : chr "F" "F" "F" "F" ...
## $ ETHNICITY : chr "White Non-Hispanic" "Hispanic" "Hispanic" "Hispanic" ...
summary(mdc)
## Dummy_Student_ID CREDITS ACAD_CAREER ACAD_ORG
## Min. : 1 Min. : 0.000 Length:503454 Length:503454
## 1st Qu.: 28255 1st Qu.: 3.000 Class :character Class :character
## Median : 52479 Median : 3.000 Mode :character Mode :character
## Mean : 50038 Mean : 2.794
## 3rd Qu.: 71946 3rd Qu.: 3.000
## Max. :101608 Max. :25.000
## NA's :89
## CLASS_NBR CRSNAME COURSE_CAMPUS CLASS_DESCR
## Length:503454 Length:503454 Length:503454 Length:503454
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## ENROLLMENT_TERM ACAD_PLAN PLAN_DESCR AGE
## Min. :2197 Length:503454 Length:503454 Min. :-1.00
## 1st Qu.:2197 Class :character Class :character 1st Qu.:19.00
## Median :2203 Mode :character Mode :character Median :21.00
## Mean :2201 Mean :24.47
## 3rd Qu.:2203 3rd Qu.:27.00
## Max. :2205 Max. :97.00
## NA's :7
## SEX ETHNICITY
## Length:503454 Length:503454
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
missmap(mdc, main = "Missing values vs observed")

boxplot(mdc[,c(2,12)])

- Find the total number of students enrolled and total number of credits College-wide (this is the sum of all the credits in the file using the credits column. For the total number of students use the Dummy Student ID column).
TotalStudents <- n_distinct(mdc[,c(1)], na.rm = FALSE)
TotalStudents
## [1] 101608
TotalCredits <- sum(mdc[,c(2)], na.rm = TRUE)
TotalCredits
## [1] 1406224
- Find the total number of students and total number of credits by Course Campus.
knitr::opts_chunk$set(echo = TRUE)
q3 <- mdc %>%
select(COURSE_CAMPUS,Dummy_Student_ID,CREDITS) %>%
group_by(COURSE_CAMPUS) %>%
summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), TotalCredits = sum(CREDITS, na.rm = TRUE))
knitr::kable(q3,"pipe")
| HIA |
11413 |
105523 |
| HOME |
9493 |
83468 |
| IAC |
12603 |
102821 |
| KENDL |
30824 |
339116 |
| MEDI |
5346 |
56380 |
| MEEK |
2696 |
12480 |
| NORTH |
23311 |
227963 |
| VIRTU |
22717 |
170275 |
| WEST |
9137 |
81921 |
| WOLF |
23848 |
226277 |
- Summarize other relevant information (information that could be useful in your opinion) in the file for College-wide and by Course Campus.
AvgAge = mean(mdc$AGE, na.rm = TRUE)
AvgAge
## [1] 24.46693
careers <- mdc %>%
select(ACAD_CAREER,Dummy_Student_ID,CREDITS,AGE) %>%
group_by(ACAD_CAREER) %>%
summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE)) %>%
arrange(desc(TotalStudents))
knitr::kable(careers,"pipe")
| UGRD |
77971 |
2.926010 |
23.23798 |
| CEPD |
24342 |
1.552745 |
36.65484 |
| PSAV |
2495 |
2.059673 |
27.16394 |
| CPP |
33 |
2.297170 |
39.66981 |
sex <- mdc %>%
select(SEX,Dummy_Student_ID,CREDITS,AGE) %>%
group_by(SEX) %>%
summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE)) %>%
arrange(desc(TotalStudents))
knitr::kable(sex,"pipe")
| F |
58688 |
2.748599 |
24.94523 |
| M |
42830 |
2.859703 |
23.77908 |
| U |
105 |
1.537634 |
23.11892 |
ethnicity <- mdc %>%
select(ETHNICITY,Dummy_Student_ID,CREDITS,AGE) %>%
group_by(ETHNICITY) %>%
summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE)) %>%
arrange(desc(TotalStudents))
knitr::kable(ethnicity,"pipe")
| Hispanic |
70718 |
2.856282 |
23.73142 |
| Black Non-Hispanic |
13871 |
2.820815 |
25.58571 |
| Unknown |
9749 |
1.710789 |
31.19204 |
| White Non-Hispanic |
5382 |
2.797926 |
25.81302 |
| Asian |
1073 |
2.828619 |
24.15829 |
| TWO OR MORE |
763 |
2.847541 |
23.65159 |
| American Indian |
71 |
2.804217 |
24.28614 |
| Pacific Islander |
53 |
2.835165 |
23.99634 |
campus <- mdc %>%
select(COURSE_CAMPUS,Dummy_Student_ID,CREDITS,AGE) %>%
group_by(COURSE_CAMPUS) %>%
summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE)) %>%
arrange(desc(TotalStudents))
knitr::kable(campus,"pipe")
| KENDL |
30824 |
2.817795 |
22.91340 |
| WOLF |
23848 |
2.815581 |
25.60516 |
| NORTH |
23311 |
2.774792 |
24.50542 |
| VIRTU |
22717 |
2.941202 |
24.82228 |
| IAC |
12603 |
2.798688 |
26.43746 |
| HIA |
11413 |
2.937477 |
23.06865 |
| HOME |
9493 |
2.687747 |
23.30970 |
| WEST |
9137 |
2.746999 |
23.48582 |
| MEDI |
5346 |
2.253938 |
28.82350 |
| MEEK |
2696 |
3.081482 |
25.93407 |
chart <- mdc %>%
select(ENROLLMENT_TERM,ETHNICITY,Dummy_Student_ID,CREDITS,AGE) %>%
group_by(ENROLLMENT_TERM,ETHNICITY) %>%
summarise(TotalStudents = n_distinct(Dummy_Student_ID, na.rm = FALSE), AvgCredits = mean(CREDITS, na.rm = TRUE), AvgAge = mean(AGE, na.rm = TRUE),TotalCredits = n_distinct(Dummy_Student_ID, na.rm = TRUE)) %>%
arrange(ENROLLMENT_TERM)
## `summarise()` has grouped output by 'ENROLLMENT_TERM'. You can override using
## the `.groups` argument.
knitr::kable(chart,"pipe")
| 2197 |
American Indian |
53 |
2.831081 |
24.17568 |
53 |
| 2197 |
Asian |
816 |
2.780695 |
24.14203 |
816 |
| 2197 |
Black Non-Hispanic |
10148 |
2.803882 |
25.12479 |
10148 |
| 2197 |
Hispanic |
53466 |
2.818179 |
23.52855 |
53466 |
| 2197 |
Pacific Islander |
41 |
2.791304 |
23.66087 |
41 |
| 2197 |
TWO OR MORE |
561 |
2.850807 |
23.17396 |
561 |
| 2197 |
Unknown |
6238 |
1.566777 |
31.57535 |
6238 |
| 2197 |
White Non-Hispanic |
3967 |
2.753427 |
25.71002 |
3967 |
| 2203 |
American Indian |
51 |
2.795775 |
24.06338 |
51 |
| 2203 |
Asian |
747 |
2.863990 |
24.06477 |
747 |
| 2203 |
Black Non-Hispanic |
9519 |
2.846784 |
25.53170 |
9519 |
| 2203 |
Hispanic |
48877 |
2.872937 |
23.73701 |
48877 |
| 2203 |
Pacific Islander |
38 |
2.876106 |
23.59292 |
38 |
| 2203 |
TWO OR MORE |
505 |
2.862248 |
23.58609 |
505 |
| 2203 |
Unknown |
5032 |
1.722745 |
31.48578 |
5032 |
| 2203 |
White Non-Hispanic |
3632 |
2.830379 |
25.63575 |
3632 |
| 2205 |
American Indian |
20 |
2.738095 |
25.42857 |
20 |
| 2205 |
Asian |
435 |
2.865672 |
24.38543 |
435 |
| 2205 |
Black Non-Hispanic |
5380 |
2.805433 |
26.65042 |
5380 |
| 2205 |
Hispanic |
26407 |
2.911757 |
24.20502 |
26407 |
| 2205 |
Pacific Islander |
21 |
2.844444 |
25.86667 |
21 |
| 2205 |
TWO OR MORE |
303 |
2.810881 |
24.85622 |
303 |
| 2205 |
Unknown |
1355 |
2.325690 |
28.36922 |
1355 |
| 2205 |
White Non-Hispanic |
2043 |
2.833695 |
26.42987 |
2043 |
- Prepare your findings in a 10-15 min presentation format. You will have an opportunity to share and discuss these findings during your second interview.
ggplot(chart, aes(ENROLLMENT_TERM,TotalStudents, colour = ETHNICITY)) +
geom_line() +
ggtitle('Ethnic Diveristy by Term')

label <- ethnicity$ETHNICITY
slice <- ethnicity$TotalStudents
pie(slice, labels = label, main = "Ethnic Split")

