Data Manipulation Exercise Answers

Import the two Census Files into R

library(tidyverse)

census_term1 <- read_csv("https://raw.githubusercontent.com/bpattizUCM/MidAIR2017/master/Census%20Term1.csv")
census_term2 <- read_csv("https://raw.githubusercontent.com/bpattizUCM/MidAIR2017/master/Census%20Term2.csv")

# Read Majors and Schools Code File

majors_code_file <- read_csv("https://raw.githubusercontent.com/bpattizUCM/MidAIR2017/master/Major%20Codes.csv") %>% select(-Prop)

Apply the following function to both Census Files to clean up the major codes

CleanUpMajorCodes <- function(dataset){
  
  library(stringr)

  df <- dataset %>%
    mutate(MAJOR_CODE = 
           case_when(str_length(MAJOR_CODE) == 1 ~ str_c("00000", MAJOR_CODE),
                     str_length(MAJOR_CODE) == 2 ~ str_c("0000", MAJOR_CODE),
                     str_length(MAJOR_CODE) == 3 ~ str_c("000", MAJOR_CODE)))
  
  return(df)
  
}

census_term1_tidy_df <- CleanUpMajorCodes(census_term1)
census_term2_tidy_df <- CleanUpMajorCodes(census_term2)

Merge the Majors and Schools Code File to both Census Files

census_term1_tidy_dataframe <-
  census_term1_tidy_df %>%
  left_join(majors_code_file)

census_term2_tidy_dataframe <-
  census_term2_tidy_df %>%
  left_join(majors_code_file)

Calculate the retention rate of all first-time freshmen (where Stype is ‘N’)

census_term1_ftf <- 
census_term1_tidy_dataframe %>%
  filter(Stype == "N")  

mean(census_term1_ftf$ID %in% census_term2_tidy_dataframe$ID)

## [1] 0.7099935

How many first-time freshmen retained to Term 2 but are part-time? (Hours < 12)

census_term2_tidy_df %>% 
  filter(Hours < 12 & (ID %in% census_term1_ftf$ID)) %>%
  summarise(n())

## # A tibble: 1 x 1
##   `n()`
##   <int>
## 1   266

Get the table of students in the first term College of Education students whose ACT scores are higher than 22, HS GPA is above 3.5 and are full time

census_term1_tidy_dataframe %>%
  filter(COLLEGE == "Education" & ACT > 22 & HS_GPA > 3.5 & Hours >= 12)

## # A tibble: 57 x 16
##       ID     Race   Age    Sex HS_GPA   ACT ACT_ENG ACT_MATH ACT_READ
##    <int>    <chr> <int>  <chr>  <dbl> <int>   <int>    <int>    <int>
##  1   121    White    32 Female   3.71    24      24       26       23
##  2   133    Black    20 Female   3.90    23      21       22       20
##  3   278 Hispanic    32   Male   3.78    23      25       24       23
##  4   479    Asian    30 Female   3.71    24      24       21       22
##  5   556    White    32 Female   3.69    32      33       34       30
##  6   619 Hispanic    33   Male   3.59    25      25       24       26
##  7  1029    White    18   Male   3.59    28      28       29       31
##  8  1041    Black    18   Male   3.59    29      27       26       30
##  9  1081    Black    35   Male   3.68    23      24       19       22
## 10  1816    White    18   Male   3.57    23      22       24       22
## # ... with 47 more rows, and 7 more variables: ACT_SCI <int>,
## #   MAJOR_CODE <chr>, Stype <chr>, Hours <dbl>, MAJOR_TITLE <chr>,
## #   COLLEGE <chr>, DEPARTMENT <chr>

Create a new field in the Majors and Schools file that creates codes for each College. Call this variable “COLLEGE_CODE” and encode it as the following: For the Arts, Humanities, and Social Sciences “CAHSS”, Business and Professional Studies “BPS”, Education “ED”, Provost (“PROVOST”), and Undesignated/Open Option (“UNDECIDED”).

majors_code_file_df <-
  majors_code_file %>%
  mutate(COLLEGE_CODE = 
           case_when(COLLEGE == "Arts, Humanities, and Social Sciences" ~ "CAHSS",
                     COLLEGE == "Business and Professional Studies" ~ "BPS",
                     COLLEGE == "Education" ~ "ED",
                     COLLEGE == "Health, Science and Technology" ~ "HST",
                     COLLEGE == "Provost" ~ "PROVOST",
                     COLLEGE == "Undesignated/Open Option" ~ "UNDECIDED"))


head(majors_code_file_df$COLLEGE_CODE)

## [1] "UNDECIDED" "UNDECIDED" "HST"       "ED"        "HST"       "HST"

For first-time freshmen who retained to the second term, find their minimum, maxmum, mean and standard deviation of their ACT and HS GPA

census_term1_ftf %>% 
  filter(ID %in% census_term2_tidy_df$ID) %>% 
  summarise(min(ACT, na.rm = TRUE), max(ACT, na.rm = TRUE), mean(ACT, na.rm = TRUE), sd(ACT, na.rm = TRUE),
            min(HS_GPA), max(HS_GPA), mean(HS_GPA), sd(HS_GPA))

## # A tibble: 1 x 8
##   `min(ACT, na.rm = TRUE)` `max(ACT, na.rm = TRUE)`
##                      <dbl>                    <dbl>
## 1                       12                       34
## # ... with 6 more variables: `mean(ACT, na.rm = TRUE)` <dbl>, `sd(ACT,
## #   na.rm = TRUE)` <dbl>, `min(HS_GPA)` <dbl>, `max(HS_GPA)` <dbl>,
## #   `mean(HS_GPA)` <dbl>, `sd(HS_GPA)` <dbl>

From the Census File for Term 2, compute the counts of students by ethnicity and student type

census_term2_tidy_df %>% 
  group_by(Race, Stype) %>%
  summarise(Count = n())

## # A tibble: 36 x 3
## # Groups:   Race [?]
##         Race Stype Count
##        <chr> <chr> <int>
##  1     Asian     C   179
##  2     Asian     M    51
##  3     Asian     N    89
##  4     Asian     R     5
##  5     Asian     T    60
##  6 Bi-Racial     C    52
##  7 Bi-Racial     M    15
##  8 Bi-Racial     N    38
##  9 Bi-Racial     R     3
## 10 Bi-Racial     T    32
## # ... with 26 more rows

Calculate the Perentage Change in Biology Majors from Census 1 to Census 2 ((N2 - N1)/N2

biology_census1 <-
  census_term1_tidy_dataframe %>%
  filter(MAJOR_TITLE == "Biology")

n_biology_census1 <- nrow(biology_census1)

biology_census2 <-
census_term2_tidy_dataframe %>%
  filter(MAJOR_TITLE == "Biology")

n_biology_census2 <- nrow(biology_census2)


(n_biology_census2 - n_biology_census1)/n_biology_census1

## [1] -0.1628571

Data Manipulation Exercise Answers

Brian Pattiz

2017-11-08