library(tidyverse)
library(stringr)
# Read Term 1 and Term 2 Census Files
census_term1 <- read_csv("https://raw.githubusercontent.com/bpattizUCM/MidAIR2017/master/Census%20Term1.csv")
census_term2 <- read_csv("https://raw.githubusercontent.com/bpattizUCM/MidAIR2017/master/Census%20Term2.csv")
# Read Majors and Schools Code File
majors_code_file <- read_csv("https://raw.githubusercontent.com/bpattizUCM/MidAIR2017/master/Major%20Codes.csv") %>% select(-Prop)
head(census_term1)
## # A tibble: 6 x 13
## ID Race Age Sex HS_GPA ACT ACT_ENG ACT_MATH ACT_READ
## <int> <chr> <int> <chr> <dbl> <int> <int> <int> <int>
## 1 1 White 22 Male 3.47 21 22 20 23
## 2 2 Hispanic 28 Female 3.14 19 17 17 16
## 3 3 White 26 Male 3.44 26 27 23 27
## 4 4 Hawaiian 27 Female 3.38 20 20 21 20
## 5 5 Hispanic 27 Female 3.02 21 23 20 22
## 6 6 White 20 Male 3.98 24 26 24 26
## # ... with 4 more variables: ACT_SCI <int>, MAJOR_CODE <int>, Stype <chr>,
## # Hours <dbl>
head(majors_code_file)
## # A tibble: 6 x 4
## MAJOR_CODE MAJOR_TITLE
## <chr> <chr>
## 1 000001 Undesignated
## 2 000002 Graduate Undesignated/Open Option
## 3 000003 Fashion: Textiles & Clothing in Business
## 4 000004 Child & Family Development
## 5 000005 Nursing
## 6 000006 Electronics Technology
## # ... with 2 more variables: COLLEGE <chr>, DEPARTMENT <chr>
str(census_term1)
## Classes 'tbl_df', 'tbl' and 'data.frame': 8935 obs. of 13 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Race : chr "White" "Hispanic" "White" "Hawaiian" ...
## $ Age : int 22 28 26 27 27 20 22 20 28 22 ...
## $ Sex : chr "Male" "Female" "Male" "Female" ...
## $ HS_GPA : num 3.47 3.14 3.44 3.38 3.02 3.98 3.11 3.58 1.71 3.85 ...
## $ ACT : int 21 19 26 20 21 24 22 22 NA NA ...
## $ ACT_ENG : int 22 17 27 20 23 26 24 22 NA NA ...
## $ ACT_MATH : int 20 17 23 21 20 24 22 20 NA NA ...
## $ ACT_READ : int 23 16 27 20 22 26 24 22 NA NA ...
## $ ACT_SCI : int 19 19 27 20 23 21 18 23 NA NA ...
## $ MAJOR_CODE: int 122 30 107 20 74 43 40 114 115 50 ...
## $ Stype : chr "C" "T" "N" "T" ...
## $ Hours : num 14 2 19 12 16 14 16 15 14 15 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 13
## .. ..$ ID : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Race : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Age : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Sex : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ HS_GPA : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ ACT : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ ACT_ENG : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ ACT_MATH : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ ACT_READ : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ ACT_SCI : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ MAJOR_CODE: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Stype : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Hours : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
str(majors_code_file)
## Classes 'tbl_df', 'tbl' and 'data.frame': 136 obs. of 4 variables:
## $ MAJOR_CODE : chr "000001" "000002" "000003" "000004" ...
## $ MAJOR_TITLE: chr "Undesignated" "Graduate Undesignated/Open Option" "Fashion: Textiles & Clothing in Business" "Child & Family Development" ...
## $ COLLEGE : chr "Undesignated/Open Option" "Undesignated/Open Option" "Health, Science and Technology" "Education" ...
## $ DEPARTMENT : chr "Undesignated/Open Option" "Undesignated/Open Option" "School of Technology" "Ed Leadership & Human Dev" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 5
## .. ..$ MAJOR_CODE : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Prop : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ MAJOR_TITLE: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ COLLEGE : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ DEPARTMENT : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
nrow(census_term1)
## [1] 8935
Add leading zeros to the MAJOR_CODE field
# Add leading zeros to the MAJOR_CODE field
# Tidyverse Appraoch
CleanUpMajorCodes <- function(dataset){
df <- dataset %>%
mutate(MAJOR_CODE =
case_when(str_length(MAJOR_CODE) == 1 ~ str_c("00000", MAJOR_CODE),
str_length(MAJOR_CODE) == 2 ~ str_c("0000", MAJOR_CODE),
str_length(MAJOR_CODE) == 3 ~ str_c("000", MAJOR_CODE)))
return(df)
}
census_term1_tidy_df <- CleanUpMajorCodes(census_term1)
head(census_term1_tidy_df$MAJOR_CODE)
## [1] "000122" "000030" "000107" "000020" "000074" "000043"
# Base R Approach
census_term1$MAJOR_CODE <- as.character(census_term1$MAJOR_CODE)
for(i in 1:length(census_term1)){
if(str_length(census_term1$MAJOR_CODE[i]) == 1){
census_term1$MAJOR_CODE[i] <- str_c("00000", census_term1$MAJOR_CODE[i])
}
else if(str_length(census_term1$MAJOR_CODE[i]) == 2){
census_term1$MAJOR_CODE[i] <- str_c("0000", census_term1$MAJOR_CODE[i])
}
else
census_term1$MAJOR_CODE[i] <- str_c("000", census_term1$MAJOR_CODE[i])
}
head(census_term1$MAJOR_CODE)
## [1] "000122" "000030" "000107" "000020" "000074" "000043"
Filter data to find New First-Time Freshmen who have an ACT greater than or equal to 25
# Filter data to find New First-Time Freshmen who have an ACT greater than or equal to 25
# Tidyverse Approach
census_term1 %>% filter(Stype == "N" & ACT >= 25)
## # A tibble: 270 x 13
## ID Race Age Sex HS_GPA ACT ACT_ENG ACT_MATH ACT_READ
## <int> <chr> <int> <chr> <dbl> <int> <int> <int> <int>
## 1 3 White 26 Male 3.44 26 27 23 27
## 2 34 White 26 Female 2.75 27 26 26 26
## 3 36 White 26 Female 3.42 25 27 27 25
## 4 70 Hispanic 33 Female 2.57 26 28 26 26
## 5 97 White 34 Female 1.87 25 26 24 25
## 6 167 White 30 Female 2.86 26 25 25 26
## 7 204 White 33 Female 3.92 30 32 29 29
## 8 237 White 33 Male 3.30 27 27 26 27
## 9 246 White 29 Male 2.65 26 26 28 26
## 10 271 White 31 Male 3.76 31 33 32 28
## # ... with 260 more rows, and 4 more variables: ACT_SCI <int>,
## # MAJOR_CODE <chr>, Stype <chr>, Hours <dbl>
# Base R Approach with subset function
subset(census_term1, Stype == "N" & ACT >= 25)
## # A tibble: 270 x 13
## ID Race Age Sex HS_GPA ACT ACT_ENG ACT_MATH ACT_READ
## <int> <chr> <int> <chr> <dbl> <int> <int> <int> <int>
## 1 3 White 26 Male 3.44 26 27 23 27
## 2 34 White 26 Female 2.75 27 26 26 26
## 3 36 White 26 Female 3.42 25 27 27 25
## 4 70 Hispanic 33 Female 2.57 26 28 26 26
## 5 97 White 34 Female 1.87 25 26 24 25
## 6 167 White 30 Female 2.86 26 25 25 26
## 7 204 White 33 Female 3.92 30 32 29 29
## 8 237 White 33 Male 3.30 27 27 26 27
## 9 246 White 29 Male 2.65 26 26 28 26
## 10 271 White 31 Male 3.76 31 33 32 28
## # ... with 260 more rows, and 4 more variables: ACT_SCI <int>,
## # MAJOR_CODE <chr>, Stype <chr>, Hours <dbl>
# Base R Approach with Dataframe Indexing
census_term1[census_term1$Stype == "N" & census_term1$ACT >= 25 & !is.na(census_term1$ACT),]
## # A tibble: 270 x 13
## ID Race Age Sex HS_GPA ACT ACT_ENG ACT_MATH ACT_READ
## <int> <chr> <int> <chr> <dbl> <int> <int> <int> <int>
## 1 3 White 26 Male 3.44 26 27 23 27
## 2 34 White 26 Female 2.75 27 26 26 26
## 3 36 White 26 Female 3.42 25 27 27 25
## 4 70 Hispanic 33 Female 2.57 26 28 26 26
## 5 97 White 34 Female 1.87 25 26 24 25
## 6 167 White 30 Female 2.86 26 25 25 26
## 7 204 White 33 Female 3.92 30 32 29 29
## 8 237 White 33 Male 3.30 27 27 26 27
## 9 246 White 29 Male 2.65 26 26 28 26
## 10 271 White 31 Male 3.76 31 33 32 28
## # ... with 260 more rows, and 4 more variables: ACT_SCI <int>,
## # MAJOR_CODE <chr>, Stype <chr>, Hours <dbl>
Merge data from Majors and Schools Code File
# Merge data from Majors and Schools Code File
# Tidyverse Approach
census_term1_tidy_dataframe <-
census_term1_tidy_df %>%
left_join(majors_code_file)
head(census_term1_tidy_dataframe %>% select(ID, MAJOR_CODE, MAJOR_TITLE:DEPARTMENT))
## # A tibble: 6 x 5
## ID MAJOR_CODE MAJOR_TITLE
## <int> <chr> <chr>
## 1 1 000122 General Studies
## 2 2 000030 Engineering Technology
## 3 3 000107 Criminal Justice
## 4 4 000020 Finance
## 5 5 000074 Design & Drafting Technology
## 6 6 000043 Health Studies
## # ... with 2 more variables: COLLEGE <chr>, DEPARTMENT <chr>
# Base R Approach
census_term1_dataframe <- merge(census_term1, majors_code_file, by = "MAJOR_CODE")
head(census_term1_dataframe)
## MAJOR_CODE ID Race Age Sex HS_GPA ACT ACT_ENG ACT_MATH ACT_READ
## 1 000005 12 Black 18 Male 3.82 18 19 19 20
## 2 000011 11 Black 20 Male 2.65 NA NA NA NA
## 3 000020 4 Hawaiian 27 Female 3.38 20 20 21 20
## 4 000030 2 Hispanic 28 Female 3.14 19 17 17 16
## 5 000040 7 Black 22 Male 3.11 22 24 22 24
## 6 000043 6 White 20 Male 3.98 24 26 24 26
## ACT_SCI Stype Hours MAJOR_TITLE
## 1 20 N 12 Nursing
## 2 NA N 12 Occupational Education
## 3 20 T 12 Finance
## 4 19 T 2 Engineering Technology
## 5 18 N 16 Graphic Design
## 6 21 C 14 Health Studies
## COLLEGE
## 1 Health, Science and Technology
## 2 Education
## 3 Business and Professional Studies
## 4 Health, Science and Technology
## 5 Arts, Humanities, and Social Sciences
## 6 Health, Science and Technology
## DEPARTMENT
## 1 Nursing
## 2 Middle, Secondary and Adult Education
## 3 Economics and Finance and the School of Accountancy
## 4 School of Technology
## 5 Art and Design
## 6 Nutrition and Kinesiology
Find the Counts by Major and Ethnicity
# Find the Counts by Major and Ethnicity
# Tidyverse Approach
census_term1_tidy_dataframe %>%
group_by(Race, MAJOR_TITLE) %>%
summarise(n())
## # A tibble: 478 x 3
## # Groups: Race [?]
## Race MAJOR_TITLE `n()`
## <chr> <chr> <int>
## 1 Asian Accountancy 5
## 2 Asian Actuarial Science & Statistics 1
## 3 Asian Agricultural Science 6
## 4 Asian Anthropology 1
## 5 Asian Art 1
## 6 Asian Athletic Training 3
## 7 Asian Automotive Technology Management 1
## 8 Asian Aviation Management 2
## 9 Asian Biology 16
## 10 Asian Career & Technology Teacher Ed 1
## # ... with 468 more rows
Find the average ACT Score for Biology Majors
# Tidyverse Approach
census_term1_tidy_dataframe %>%
filter(MAJOR_TITLE == "Biology") %>%
summarise(AVG_ACT = mean(ACT, na.rm = TRUE))
## # A tibble: 1 x 1
## AVG_ACT
## <dbl>
## 1 22.28016
# Base R
mean(subset(census_term1_tidy_dataframe, MAJOR_TITLE == "Biology")$ACT, na.rm = TRUE)
## [1] 22.28016
Find the Returning First-Time Freshmen Nursing Majors from the First Term
ftf_returning_nursing_majors <-
census_term1_tidy_dataframe %>%
filter(Stype == "N" & MAJOR_TITLE == "Nursing" & ID %in% census_term2$ID)
head(ftf_returning_nursing_majors)
## # A tibble: 6 x 16
## ID Race Age Sex HS_GPA ACT ACT_ENG ACT_MATH ACT_READ
## <int> <chr> <int> <chr> <dbl> <int> <int> <int> <int>
## 1 12 Black 18 Male 3.82 18 19 19 20
## 2 34 White 26 Female 2.75 27 26 26 26
## 3 148 White 25 Male 2.98 NA NA NA NA
## 4 201 White 21 Male 2.11 20 20 19 19
## 5 607 Asian 34 Female 3.59 NA NA NA NA
## 6 663 Hispanic 21 Male 3.64 19 19 18 20
## # ... with 7 more variables: ACT_SCI <int>, MAJOR_CODE <chr>, Stype <chr>,
## # Hours <dbl>, MAJOR_TITLE <chr>, COLLEGE <chr>, DEPARTMENT <chr>