Paradigms and Packages for Data Manipulation

Data Subsetting/Filtering

Goals

  • Select columns of interest
  • Filter observations based certain values
    • Use operators: <, <=, >, >=, ==, %in%, &, ect.

base R

  • Dataframes can be treated like matricies, using row and column indicies
  • Dataframe columns can be selected using the $ operator
  • Simplyfing subsetting: Returns the simplest data structure that can represent output (e.g, a dataframe reduces to a single vector)
  • Preserving subsetting: Keeps the structure of the output the same.
df[row_index, col_index]

df[, df$var]

df[row_sub, col_sub, drop = FALSE]
  • df: The name of the data.frame
  • row_index: integer of the rows to subset
  • col_index: integer of the columns to subset
  • row_sub: vector specifying the subset of rows
  • col_sub: vector specyfing the subset of columns
  • drop = FALSE: Preserves the structure
# Row subsetting example

# In 2007, find the sum of all federal Pell grants and federal loans for full-time, female, age 17-20, Hispanic Colorado residents

library(jsonlite)

colorado_fin_aid_data<- fromJSON('https://data.colorado.gov/resource/nmvq-u3ir.json')

row_filter <-  colorado_fin_aid_data['year'] == '2007' & colorado_fin_aid_data['fafsaindicator'] == '1' & colorado_fin_aid_data['residency'] == 'In-State' & colorado_fin_aid_data['ethnicity'] == 'Hispanic' &
              colorado_fin_aid_data['agedesc'] == '17-20'  & colorado_fin_aid_data['gender'] == 'Female' &   
              colorado_fin_aid_data['enrollmentclassification'] == 'Full-time'


hispanic_co_resident_students <- colorado_fin_aid_data[row_filter, c('sumfederalpell', 'sumfederalloans')] 


hispanic_co_resident_students <- colorado_fin_aid_data[row_filter, c('sumfederalpell', 'sumfederalloans'), drop = FALSE] 

hispanic_co_resident_students[1, 1]
[1] "218310"
class(hispanic_co_resident_students[1, 1])
[1] "character"
hispanic_co_resident_students[1, 1, drop = FALSE]
    sumfederalpell
710         218310
class(hispanic_co_resident_students[1, 1, drop = FALSE])
[1] "data.frame"
# Column subsetting example

# Select the year, fafsaindicator, agedesc, gender, ethnicity, enrollment classification and recordcount

colorado_fin_aid_subset <- colorado_fin_aid_data[, c('year', 'fafsaindicator', 'agedesc', 'gender', 'ethnicity','enrollmentclassification', 'recordcount')]

head(colorado_fin_aid_subset)
  year fafsaindicator  agedesc gender                         ethnicity
1 2004              0    21-24 Female                          Hispanic
2 2004              1 Under 17 Female Native American or Alaskan Native
3 2004              1 Under 17 Female               White, non-Hispanic
4 2004              1 Under 17   Male                          Hispanic
5 2004              1 Under 17   Male               White, non-Hispanic
6 2004              1    17-20 Female         Asian or Pacific Islander
  enrollmentclassification recordcount
1                Full-time           2
2                Half-time           1
3      Less-than-half-time           1
4                Half-time           1
5                Half-time           1
6                Full-time          15
# Count of male age 21-24, Asian or Pacific Islander In-state in 2004

colorado_fin_aid_data[colorado_fin_aid_data$year== '2004' & colorado_fin_aid_data$residency == 'In-State' & colorado_fin_aid_data$agedesc == '21-24' & colorado_fin_aid_data$gender == 'Male' & colorado_fin_aid_data$ethnicity == 'Asian or Pacific Islander',] -> asian_colorado_res_2124

print(asian_colorado_res_2124)
    year fafsaindicator residency agedesc gender                 ethnicity
40  2004              1  In-State   21-24   Male Asian or Pacific Islander
41  2004              1  In-State   21-24   Male Asian or Pacific Islander
42  2004              1  In-State   21-24   Male Asian or Pacific Islander
157 2004              0  In-State   21-24   Male Asian or Pacific Islander
158 2004              0  In-State   21-24   Male Asian or Pacific Islander
    enrollmentclassification sumstateaid sumfederalpell sumfederalloans
40                 Full-time       23853          63246           18863
41                 Half-time        1125           8592            3313
42       Less-than-half-time           0            507               0
157                Half-time           0              0               0
158      Less-than-half-time           0              0               0
    sumotherfederal sumfederalplus sumotherloans sumotherscholarships
40             9415              0             0                 3048
41            10800              0             0                    0
42                0              0             0                    0
157               0              0             0                    0
158               0              0             0                    0
    recordcount
40           14
41           12
42            2
157           1
158           1
sum(as.numeric(asian_colorado_res_2124$recordcount))
[1] 30

dplyr subsetting and filtering functions

  • select(df, ...): extracts columns into a data frame
  • pull(df, ...): extracts column values into a vector
  • rename(df, ...): renames columns
  • filter(df, ...): subsets data by extracting rows that meet a logical expression
  • distict(df, ..., keep_all = FALSE):remove rows with duplicates
  • slice(df,...): select rows by position
library(readxl)

# Load CCIHE 2018 file

ccihe_2018 <- read_excel("Z:/R Training/CCIHE2018-PublicData.xlsx", sheet = 'Data')

# dplyr select verb------------------------------------------------------------
# Load tidyverse suite of packages
library(tidyverse)

# Select unit id, name, state, 2018 Basic Classifications and 2018 UG Profile
select(ccihe_2018, UNITID, NAME, STABBR, BASIC2018, UGPROFILE2018)
# A tibble: 4,324 x 5
   UNITID NAME                               STABBR BASIC2018 UGPROFILE2018
    <dbl> <chr>                              <chr>      <dbl>         <dbl>
 1 177834 A T Still University of Health Sc~ MO            25             0
 2 180203 Aaniiih Nakoda College             MT            33             3
 3 222178 Abilene Christian University       TX            18            12
 4 138558 Abraham Baldwin Agricultural Coll~ GA            23             7
 5 488031 Abraham Lincoln University         CA            31            10
 6 172866 Academy College                    MN            28             7
 7 451079 Academy for Five Element Acupunct~ FL            26             0
 8 457271 Academy for Jewish Religion-Calif~ CA            24             0
 9 412173 Academy for Nursing and Health Oc~ FL            10             3
10 108232 Academy of Art University          CA            18             5
# ... with 4,314 more rows
# Select UNITID and ACT Category through weighted ACT 25 percentile score
select(ccihe_2018, UNITID, ACTCAT:ACTFINAL)
# A tibble: 4,324 x 11
   UNITID ACTCAT  NSAT  NACT NSATACT SATV25 SATM25 SATCMB25 SATACTEQ25
    <dbl>  <dbl> <dbl> <dbl>   <dbl>  <dbl>  <dbl>    <dbl>      <dbl>
 1 177834      0     0     0       0      0      0        0          0
 2 180203      1     0     0       0      0      0        0          0
 3 222178      2   455   507     962    510    515     1025         19
 4 138558      1   526   430     956    470    450      920         17
 5 488031      1     0     0       0      0      0        0          0
 6 172866      1     0     0       0      0      0        0          0
 7 451079      0     0     0       0      0      0        0          0
 8 457271      0     0     0       0      0      0        0          0
 9 412173      1     0     0       0      0      0        0          0
10 108232      1     0     0       0      0      0        0          0
# ... with 4,314 more rows, and 2 more variables: ACTCMP25 <dbl>,
#   ACTFINAL <dbl>
# Extract all ids and put them in a vector
pull(ccihe_2018, UNITID) -> ids

head(ids)
[1] 177834 180203 222178 138558 488031 172866
# Rename STABBR to STATE and select UNITID STATE
select(rename(ccihe_2018, STATE = STABBR), UNITID, STATE)
# A tibble: 4,324 x 2
   UNITID STATE
    <dbl> <chr>
 1 177834 MO   
 2 180203 MT   
 3 222178 TX   
 4 138558 GA   
 5 488031 CA   
 6 172866 MN   
 7 451079 FL   
 8 457271 CA   
 9 412173 FL   
10 108232 CA   
# ... with 4,314 more rows
# dplyr filter verb------------------------------------------------------------

# Filter data for University of Denver and University of Colorado-Boulder
filter(ccihe_2018, UNITID %in% c(127060, 126614))
# A tibble: 2 x 97
  UNITID NAME  CITY  STABBR CC2000 BASIC2005 BASIC2010 BASIC2015 BASIC2018
   <dbl> <chr> <chr> <chr>   <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
1 126614 Univ~ Boul~ CO         15        15        15        15        15
2 127060 Univ~ Denv~ CO         15        16        16        16        16
# ... with 88 more variables: IPUG2018 <dbl>, IPGRAD2018 <dbl>,
#   ENRPROFILE2018 <dbl>, UGPROFILE2018 <dbl>, SIZESET2018 <dbl>,
#   CCE2015 <dbl>, OBEREG <dbl>, SECTOR <dbl>, ICLEVEL <dbl>,
#   CONTROL <dbl>, LOCALE <dbl>, LANDGRNT <dbl>, MEDICAL <dbl>,
#   HBCU <dbl>, TRIBAL <dbl>, HSI <dbl>, MSI <dbl>, WOMENS <dbl>,
#   COPLAC <dbl>, CUSU <dbl>, CUMU <dbl>, ASSOCDEG <dbl>, BACCDEG <dbl>,
#   MASTDEG <dbl>, DOCRSDEG <dbl>, DOCPPDEG <dbl>, DOCOTHDEG <dbl>,
#   TOTDEG <dbl>, `S&ER&D` <dbl>, `NONS&ER&D` <dbl>, PDNFRSTAFF <dbl>,
#   FACNUM <dbl>, HUM_RSD <dbl>, SOCSC_RSD <dbl>, STEM_RSD <dbl>,
#   OTHER_RSD <dbl>, `DRSA&S` <dbl>, DRSPROF <dbl>, `OGRDA&S` <dbl>,
#   OGRDPROF <dbl>, `A&SBADEG` <dbl>, PROFBADEG <dbl>, ASC1C2TRNS <dbl>,
#   ASC1C2CRTC <dbl>, FALLENR16 <dbl>, ANENR1617 <dbl>, FALLENR17 <dbl>,
#   FALLFTE17 <dbl>, UGTENR17 <dbl>, GRTENR17 <dbl>, UGDSFTF17 <dbl>,
#   UGDSPTF17 <dbl>, UGNDFT17 <dbl>, UGNDPT17 <dbl>, GRFTF17 <dbl>,
#   GRPTF17 <dbl>, UGN1STTMFT17 <dbl>, UGN1STTMPT17 <dbl>,
#   UGNTRFT17 <dbl>, UGNTRPT17 <dbl>, FAITHFLAG <dbl>, OTHSFFLAG <dbl>,
#   NUMCIP2 <dbl>, LRGSTCIP2 <dbl>, PCTLRGST <dbl>, UGCIP4PR <dbl>,
#   GRCIP4PR <dbl>, COEXPR <dbl>, PCTCOEX <dbl>, DOCRESFLAG <dbl>,
#   MAXGPEDUC <dbl>, MAXGPBUS <dbl>, MAXGPOTH <dbl>, NGCIP2PXDR <dbl>,
#   NGCIP2DR <dbl>, ROOMS <dbl>, ACTCAT <dbl>, NSAT <dbl>, NACT <dbl>,
#   NSATACT <dbl>, SATV25 <dbl>, SATM25 <dbl>, SATCMB25 <dbl>,
#   SATACTEQ25 <dbl>, ACTCMP25 <dbl>, ACTFINAL <dbl>, ...96 <lgl>,
#   ...97 <dbl>
# Find all doctoral granting institutions with a SAT MATH 75th percentile score higher than 600
 
select(filter(ccihe_2018, BASIC2018 %in% c(15, 16, 17) & SATM25 >= 600), UNITID, NAME, SATM25)
# A tibble: 72 x 3
   UNITID NAME                               SATM25
    <dbl> <chr>                               <dbl>
 1 196079 Binghamton University                 650
 2 164924 Boston College                        660
 3 164988 Boston University                     660
 4 165015 Brandeis University                   650
 5 230038 Brigham Young University-Provo        600
 6 217156 Brown University                      700
 7 110404 California Institute of Technology    780
 8 211440 Carnegie Mellon University            730
 9 201645 Case Western Reserve University       690
10 217882 Clemson University                    600
# ... with 62 more rows