data.frame
. Extremely fast and memory efficient.<
, <=
, >
, >=
, ==
, %in%
, &
, ect.
$
operatordf[row_index, col_index]
df[, df$var]
df[row_sub, col_sub, drop = FALSE]
df
: The name of the data.framerow_index
: integer of the rows to subsetcol_index
: integer of the columns to subsetrow_sub
: vector specifying the subset of rowscol_sub
: vector specyfing the subset of columnsdrop = FALSE
: Preserves the structure# Row subsetting example
# In 2007, find the sum of all federal Pell grants and federal loans for full-time, female, age 17-20, Hispanic Colorado residents
library(jsonlite)
colorado_fin_aid_data<- fromJSON('https://data.colorado.gov/resource/nmvq-u3ir.json')
row_filter <- colorado_fin_aid_data['year'] == '2007' & colorado_fin_aid_data['fafsaindicator'] == '1' & colorado_fin_aid_data['residency'] == 'In-State' & colorado_fin_aid_data['ethnicity'] == 'Hispanic' &
colorado_fin_aid_data['agedesc'] == '17-20' & colorado_fin_aid_data['gender'] == 'Female' &
colorado_fin_aid_data['enrollmentclassification'] == 'Full-time'
hispanic_co_resident_students <- colorado_fin_aid_data[row_filter, c('sumfederalpell', 'sumfederalloans')]
hispanic_co_resident_students <- colorado_fin_aid_data[row_filter, c('sumfederalpell', 'sumfederalloans'), drop = FALSE]
hispanic_co_resident_students[1, 1]
[1] "218310"
class(hispanic_co_resident_students[1, 1])
[1] "character"
hispanic_co_resident_students[1, 1, drop = FALSE]
sumfederalpell
710 218310
class(hispanic_co_resident_students[1, 1, drop = FALSE])
[1] "data.frame"
# Column subsetting example
# Select the year, fafsaindicator, agedesc, gender, ethnicity, enrollment classification and recordcount
colorado_fin_aid_subset <- colorado_fin_aid_data[, c('year', 'fafsaindicator', 'agedesc', 'gender', 'ethnicity','enrollmentclassification', 'recordcount')]
head(colorado_fin_aid_subset)
year fafsaindicator agedesc gender ethnicity
1 2004 0 21-24 Female Hispanic
2 2004 1 Under 17 Female Native American or Alaskan Native
3 2004 1 Under 17 Female White, non-Hispanic
4 2004 1 Under 17 Male Hispanic
5 2004 1 Under 17 Male White, non-Hispanic
6 2004 1 17-20 Female Asian or Pacific Islander
enrollmentclassification recordcount
1 Full-time 2
2 Half-time 1
3 Less-than-half-time 1
4 Half-time 1
5 Half-time 1
6 Full-time 15
# Count of male age 21-24, Asian or Pacific Islander In-state in 2004
colorado_fin_aid_data[colorado_fin_aid_data$year== '2004' & colorado_fin_aid_data$residency == 'In-State' & colorado_fin_aid_data$agedesc == '21-24' & colorado_fin_aid_data$gender == 'Male' & colorado_fin_aid_data$ethnicity == 'Asian or Pacific Islander',] -> asian_colorado_res_2124
print(asian_colorado_res_2124)
year fafsaindicator residency agedesc gender ethnicity
40 2004 1 In-State 21-24 Male Asian or Pacific Islander
41 2004 1 In-State 21-24 Male Asian or Pacific Islander
42 2004 1 In-State 21-24 Male Asian or Pacific Islander
157 2004 0 In-State 21-24 Male Asian or Pacific Islander
158 2004 0 In-State 21-24 Male Asian or Pacific Islander
enrollmentclassification sumstateaid sumfederalpell sumfederalloans
40 Full-time 23853 63246 18863
41 Half-time 1125 8592 3313
42 Less-than-half-time 0 507 0
157 Half-time 0 0 0
158 Less-than-half-time 0 0 0
sumotherfederal sumfederalplus sumotherloans sumotherscholarships
40 9415 0 0 3048
41 10800 0 0 0
42 0 0 0 0
157 0 0 0 0
158 0 0 0 0
recordcount
40 14
41 12
42 2
157 1
158 1
sum(as.numeric(asian_colorado_res_2124$recordcount))
[1] 30
select(df, ...)
: extracts columns into a data framepull(df, ...)
: extracts column values into a vectorrename(df, ...)
: renames columnsfilter(df, ...)
: subsets data by extracting rows that meet a logical expressiondistict(df, ..., keep_all = FALSE)
:remove rows with duplicatesslice(df,...)
: select rows by positionlibrary(readxl)
# Load CCIHE 2018 file
ccihe_2018 <- read_excel("Z:/R Training/CCIHE2018-PublicData.xlsx", sheet = 'Data')
# dplyr select verb------------------------------------------------------------
# Load tidyverse suite of packages
library(tidyverse)
# Select unit id, name, state, 2018 Basic Classifications and 2018 UG Profile
select(ccihe_2018, UNITID, NAME, STABBR, BASIC2018, UGPROFILE2018)
# A tibble: 4,324 x 5
UNITID NAME STABBR BASIC2018 UGPROFILE2018
<dbl> <chr> <chr> <dbl> <dbl>
1 177834 A T Still University of Health Sc~ MO 25 0
2 180203 Aaniiih Nakoda College MT 33 3
3 222178 Abilene Christian University TX 18 12
4 138558 Abraham Baldwin Agricultural Coll~ GA 23 7
5 488031 Abraham Lincoln University CA 31 10
6 172866 Academy College MN 28 7
7 451079 Academy for Five Element Acupunct~ FL 26 0
8 457271 Academy for Jewish Religion-Calif~ CA 24 0
9 412173 Academy for Nursing and Health Oc~ FL 10 3
10 108232 Academy of Art University CA 18 5
# ... with 4,314 more rows
# Select UNITID and ACT Category through weighted ACT 25 percentile score
select(ccihe_2018, UNITID, ACTCAT:ACTFINAL)
# A tibble: 4,324 x 11
UNITID ACTCAT NSAT NACT NSATACT SATV25 SATM25 SATCMB25 SATACTEQ25
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 177834 0 0 0 0 0 0 0 0
2 180203 1 0 0 0 0 0 0 0
3 222178 2 455 507 962 510 515 1025 19
4 138558 1 526 430 956 470 450 920 17
5 488031 1 0 0 0 0 0 0 0
6 172866 1 0 0 0 0 0 0 0
7 451079 0 0 0 0 0 0 0 0
8 457271 0 0 0 0 0 0 0 0
9 412173 1 0 0 0 0 0 0 0
10 108232 1 0 0 0 0 0 0 0
# ... with 4,314 more rows, and 2 more variables: ACTCMP25 <dbl>,
# ACTFINAL <dbl>
# Extract all ids and put them in a vector
pull(ccihe_2018, UNITID) -> ids
head(ids)
[1] 177834 180203 222178 138558 488031 172866
# Rename STABBR to STATE and select UNITID STATE
select(rename(ccihe_2018, STATE = STABBR), UNITID, STATE)
# A tibble: 4,324 x 2
UNITID STATE
<dbl> <chr>
1 177834 MO
2 180203 MT
3 222178 TX
4 138558 GA
5 488031 CA
6 172866 MN
7 451079 FL
8 457271 CA
9 412173 FL
10 108232 CA
# ... with 4,314 more rows
# dplyr filter verb------------------------------------------------------------
# Filter data for University of Denver and University of Colorado-Boulder
filter(ccihe_2018, UNITID %in% c(127060, 126614))
# A tibble: 2 x 97
UNITID NAME CITY STABBR CC2000 BASIC2005 BASIC2010 BASIC2015 BASIC2018
<dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 126614 Univ~ Boul~ CO 15 15 15 15 15
2 127060 Univ~ Denv~ CO 15 16 16 16 16
# ... with 88 more variables: IPUG2018 <dbl>, IPGRAD2018 <dbl>,
# ENRPROFILE2018 <dbl>, UGPROFILE2018 <dbl>, SIZESET2018 <dbl>,
# CCE2015 <dbl>, OBEREG <dbl>, SECTOR <dbl>, ICLEVEL <dbl>,
# CONTROL <dbl>, LOCALE <dbl>, LANDGRNT <dbl>, MEDICAL <dbl>,
# HBCU <dbl>, TRIBAL <dbl>, HSI <dbl>, MSI <dbl>, WOMENS <dbl>,
# COPLAC <dbl>, CUSU <dbl>, CUMU <dbl>, ASSOCDEG <dbl>, BACCDEG <dbl>,
# MASTDEG <dbl>, DOCRSDEG <dbl>, DOCPPDEG <dbl>, DOCOTHDEG <dbl>,
# TOTDEG <dbl>, `S&ER&D` <dbl>, `NONS&ER&D` <dbl>, PDNFRSTAFF <dbl>,
# FACNUM <dbl>, HUM_RSD <dbl>, SOCSC_RSD <dbl>, STEM_RSD <dbl>,
# OTHER_RSD <dbl>, `DRSA&S` <dbl>, DRSPROF <dbl>, `OGRDA&S` <dbl>,
# OGRDPROF <dbl>, `A&SBADEG` <dbl>, PROFBADEG <dbl>, ASC1C2TRNS <dbl>,
# ASC1C2CRTC <dbl>, FALLENR16 <dbl>, ANENR1617 <dbl>, FALLENR17 <dbl>,
# FALLFTE17 <dbl>, UGTENR17 <dbl>, GRTENR17 <dbl>, UGDSFTF17 <dbl>,
# UGDSPTF17 <dbl>, UGNDFT17 <dbl>, UGNDPT17 <dbl>, GRFTF17 <dbl>,
# GRPTF17 <dbl>, UGN1STTMFT17 <dbl>, UGN1STTMPT17 <dbl>,
# UGNTRFT17 <dbl>, UGNTRPT17 <dbl>, FAITHFLAG <dbl>, OTHSFFLAG <dbl>,
# NUMCIP2 <dbl>, LRGSTCIP2 <dbl>, PCTLRGST <dbl>, UGCIP4PR <dbl>,
# GRCIP4PR <dbl>, COEXPR <dbl>, PCTCOEX <dbl>, DOCRESFLAG <dbl>,
# MAXGPEDUC <dbl>, MAXGPBUS <dbl>, MAXGPOTH <dbl>, NGCIP2PXDR <dbl>,
# NGCIP2DR <dbl>, ROOMS <dbl>, ACTCAT <dbl>, NSAT <dbl>, NACT <dbl>,
# NSATACT <dbl>, SATV25 <dbl>, SATM25 <dbl>, SATCMB25 <dbl>,
# SATACTEQ25 <dbl>, ACTCMP25 <dbl>, ACTFINAL <dbl>, ...96 <lgl>,
# ...97 <dbl>
# Find all doctoral granting institutions with a SAT MATH 75th percentile score higher than 600
select(filter(ccihe_2018, BASIC2018 %in% c(15, 16, 17) & SATM25 >= 600), UNITID, NAME, SATM25)
# A tibble: 72 x 3
UNITID NAME SATM25
<dbl> <chr> <dbl>
1 196079 Binghamton University 650
2 164924 Boston College 660
3 164988 Boston University 660
4 165015 Brandeis University 650
5 230038 Brigham Young University-Provo 600
6 217156 Brown University 700
7 110404 California Institute of Technology 780
8 211440 Carnegie Mellon University 730
9 201645 Case Western Reserve University 690
10 217882 Clemson University 600
# ... with 62 more rows