Doing a literature review

Case Study on HBCUs

I am going to install and load the packages I need.

#install.packages("tidyverse")
#install.packages("lubridate")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
# Read the HBCU dataset from GitHub into a tibble (one row per institution).
hbcu <- readr::read_csv(
  file = "https://raw.githubusercontent.com/quant-shop/intro-comp-educ-soc/refs/heads/main/data/hbcu_data.csv"
)
## Rows: 102 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): name, city, state, type
## dbl (3): founded, lat, lon
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the top of the data: head() returns the first six rows by default.
head(hbcu, n = 6)
## # A tibble: 6 × 7
##   name                     city       state founded   lat   lon type           
##   <chr>                    <chr>      <chr>   <dbl> <dbl> <dbl> <chr>          
## 1 Alabama A&M University   Normal     AL       1875  34.8 -86.6 Public, 4 Year 
## 2 Alabama State University Montgomery AL       1867  32.4 -86.3 Public, 4 Year 
## 3 Albany State University  Albany     GA       1903  31.6 -84.2 Public, 4 Year 
## 4 Alcorn State University  Lorman     MS       1871  31.9 -91.1 Public, 4 Year 
## 5 Allen University         Columbia   SC       1870  34.0 -81.0 Private, 4 Year
## 6 American Baptist College Nashville  TN       1924  36.2 -86.8 Private, 4 Year
tail(hbcu, n = 6) # tail() returns the last six rows by default
## # A tibble: 6 × 7
##   name                           city          state founded   lat   lon type   
##   <chr>                          <chr>         <chr>   <dbl> <dbl> <dbl> <chr>  
## 1 Voorhees University            Denmark       SC       1962  33.3 -81.1 Privat…
## 2 West Virginia State University Institute     WV       1865  38.4 -81.8 Public…
## 3 Wilberforce University         Wilberforce   OH       1856  39.7 -83.9 Privat…
## 4 Wiley College                  Marshall      TX       1873  32.5 -94.4 Privat…
## 5 Winston-Salem State University Winston-Salem NC       1892  36.1 -80.2 Public…
## 6 Xavier University of Louisiana New Orleans   LA       1915  30.0 -90.1 Privat…
hbcu %>% head(n = 10) # show the first ten rows
## # A tibble: 10 × 7
##    name                       city          state founded   lat   lon type      
##    <chr>                      <chr>         <chr>   <dbl> <dbl> <dbl> <chr>     
##  1 Alabama A&M University     Normal        AL       1875  34.8 -86.6 Public, 4…
##  2 Alabama State University   Montgomery    AL       1867  32.4 -86.3 Public, 4…
##  3 Albany State University    Albany        GA       1903  31.6 -84.2 Public, 4…
##  4 Alcorn State University    Lorman        MS       1871  31.9 -91.1 Public, 4…
##  5 Allen University           Columbia      SC       1870  34.0 -81.0 Private, …
##  6 American Baptist College   Nashville     TN       1924  36.2 -86.8 Private, …
##  7 Arkansas Baptist College   Little Rock   AR       1884  34.7 -92.3 Private, …
##  8 Benedict College           Columbia      SC       1870  34.0 -81.0 Private, …
##  9 Bennett College            Greensboro    NC       1873  36.1 -79.8 Private, …
## 10 Bethune-Cookman University Daytona Beach FL       1904  29.2 -81.0 Private, …
hbcu %>% tail(n = 20) # show the last twenty rows
## # A tibble: 20 × 7
##    name                                   city   state founded   lat   lon type 
##    <chr>                                  <chr>  <chr>   <dbl> <dbl> <dbl> <chr>
##  1 St. Philip's College                   San A… TX       1898  29.4 -98.5 Publ…
##  2 Stillman College                       Tusca… AL       1876  33.2 -87.6 Priv…
##  3 Talladega College                      Talla… AL       1867  33.5 -86.1 Priv…
##  4 Tennessee State University             Nashv… TN       1912  36.2 -86.8 Publ…
##  5 Texas College                          Tyler  TX       1894  32.4 -95.3 Priv…
##  6 Texas Southern University              Houst… TX       1947  29.7 -95.4 Publ…
##  7 Tougaloo College                       Touga… MS       1871  32.4 -90.5 Priv…
##  8 Tuskegee University                    Tuske… AL       1867  32.4 -85.7 Priv…
##  9 University of Arkansas at Pine Bluff   Pine … AR       1873  34.2 -92.0 Publ…
## 10 University of Maryland Eastern Shore   Princ… MD       1886  38.2 -75.7 Publ…
## 11 University of the District of Columbia Washi… DC       1962  38.9 -77.0 Publ…
## 12 University of the Virgin Islands       St. T… USVI     1882  18.3 -65.0 Publ…
## 13 Virginia State University              Peter… VA       1865  37.2 -77.4 Publ…
## 14 Virginia Union University              Richm… VA       1886  37.5 -77.5 Priv…
## 15 Voorhees University                    Denma… SC       1962  33.3 -81.1 Priv…
## 16 West Virginia State University         Insti… WV       1865  38.4 -81.8 Publ…
## 17 Wilberforce University                 Wilbe… OH       1856  39.7 -83.9 Priv…
## 18 Wiley College                          Marsh… TX       1873  32.5 -94.4 Priv…
## 19 Winston-Salem State University         Winst… NC       1892  36.1 -80.2 Publ…
## 20 Xavier University of Louisiana         New O… LA       1915  30.0 -90.1 Priv…

Get a glimpse of the data

# glimpse() gives a sneak peek of the data set: one line per column, showing
# its type and first few values.
hbcu %>% glimpse()
## Rows: 102
## Columns: 7
## $ name    <chr> "Alabama A&M University", "Alabama State University", "Albany …
## $ city    <chr> "Normal", "Montgomery", "Albany", "Lorman", "Columbia", "Nashv…
## $ state   <chr> "AL", "AL", "GA", "MS", "SC", "TN", "AR", "SC", "NC", "FL", "A…
## $ founded <dbl> 1875, 1867, 1903, 1871, 1870, 1924, 1884, 1870, 1873, 1904, 19…
## $ lat     <dbl> 34.7834, 32.3643, 31.5785, 31.8769, 34.0298, 36.1659, 34.7465,…
## $ lon     <dbl> -86.5683, -86.2952, -84.1543, -91.1458, -81.0115, -86.7844, -9…
## $ type    <chr> "Public, 4 Year", "Public, 4 Year", "Public, 4 Year", "Public,…

I want to get a summary of the data.

# Summary statistics for every column of the raw data
hbcu %>% summary()
##      name               city              state              founded    
##  Length:102         Length:102         Length:102         Min.   :1837  
##  Class :character   Class :character   Class :character   1st Qu.:1870  
##  Mode  :character   Mode  :character   Mode  :character   Median :1886  
##                                                           Mean   :1895  
##                                                           3rd Qu.:1905  
##                                                           Max.   :1988  
##       lat             lon             type          
##  Min.   :18.34   Min.   :-98.50   Length:102        
##  1st Qu.:32.48   1st Qu.:-90.13   Class :character  
##  Median :34.02   Median :-84.64   Mode  :character  
##  Mean   :34.31   Mean   :-85.13                     
##  3rd Qu.:36.17   3rd Qu.:-80.78                     
##  Max.   :39.93   Max.   :-64.96
# Look up the most recently founded institution(s).
# The founding year is computed from the data rather than copied by hand from
# the summary() output (where the maximum is 1988), so this still returns the
# newest institution(s) if the data set changes. Ties are all returned.
hbcu %>%
  filter(founded == max(founded, na.rm = TRUE))
## # A tibble: 1 × 7
##   name                     city    state founded   lat   lon type           
##   <chr>                    <chr>   <chr>   <dbl> <dbl> <dbl> <chr>          
## 1 Clark Atlanta University Atlanta GA       1988  33.7 -84.4 Private, 4 Year
# str() prints the structure of the data frame and each of its variables
hbcu %>% str()
## spc_tbl_ [102 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ name   : chr [1:102] "Alabama A&M University" "Alabama State University" "Albany State University" "Alcorn State University" ...
##  $ city   : chr [1:102] "Normal" "Montgomery" "Albany" "Lorman" ...
##  $ state  : chr [1:102] "AL" "AL" "GA" "MS" ...
##  $ founded: num [1:102] 1875 1867 1903 1871 1870 ...
##  $ lat    : num [1:102] 34.8 32.4 31.6 31.9 34 ...
##  $ lon    : num [1:102] -86.6 -86.3 -84.2 -91.1 -81 ...
##  $ type   : chr [1:102] "Public, 4 Year" "Public, 4 Year" "Public, 4 Year" "Public, 4 Year" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   name = col_character(),
##   ..   city = col_character(),
##   ..   state = col_character(),
##   ..   founded = col_double(),
##   ..   lat = col_double(),
##   ..   lon = col_double(),
##   ..   type = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Clean the data to make analysis easier and, more importantly, to prepare it for plotting and mapping.

# Build a cleaned copy of the data for plotting and mapping:
# `founded` stays numeric, and the categorical columns `type` and `state`
# are converted to factors.
hbcu2 <- hbcu %>%
  mutate(
    founded = as.numeric(founded),
    across(c(type, state), as.factor)
  )
hbcu2
## # A tibble: 102 × 7
##    name                       city          state founded   lat   lon type      
##    <chr>                      <chr>         <fct>   <dbl> <dbl> <dbl> <fct>     
##  1 Alabama A&M University     Normal        AL       1875  34.8 -86.6 Public, 4…
##  2 Alabama State University   Montgomery    AL       1867  32.4 -86.3 Public, 4…
##  3 Albany State University    Albany        GA       1903  31.6 -84.2 Public, 4…
##  4 Alcorn State University    Lorman        MS       1871  31.9 -91.1 Public, 4…
##  5 Allen University           Columbia      SC       1870  34.0 -81.0 Private, …
##  6 American Baptist College   Nashville     TN       1924  36.2 -86.8 Private, …
##  7 Arkansas Baptist College   Little Rock   AR       1884  34.7 -92.3 Private, …
##  8 Benedict College           Columbia      SC       1870  34.0 -81.0 Private, …
##  9 Bennett College            Greensboro    NC       1873  36.1 -79.8 Private, …
## 10 Bethune-Cookman University Daytona Beach FL       1904  29.2 -81.0 Private, …
## # ℹ 92 more rows

What is the distribution of HBCUs by type?

# Tally institutions per type, listing the largest group first
hbcu2 %>%
  count(type, sort = TRUE)
## # A tibble: 5 × 2
##   type                     n
##   <fct>                <int>
## 1 Private, 4 Year         45
## 2 Public, 4 Year          40
## 3 Public, 2 Year          11
## 4 Private, Specialized     4
## 5 Private, 2 Year          2
# Histogram of founding years in 10-year bins.
# boundary = 0 lines the bin edges up with calendar decades (1860-1870,
# 1870-1880, ...). Without it ggplot2 centres the bins on the decade marks,
# so every bar would straddle two decades (e.g. 1865-1875).
hbcu %>%
  ggplot(aes(x = founded)) +
  geom_histogram(
    binwidth = 10,
    boundary = 0,
    fill = "steelblue",
    color = "white"
  ) +
  labs(
    title = "Founding Years of HBCUs",
    x = "Year Founded",
    y = "Number of Institutions"
  )

# Count institutions per state on the raw data (state is a character column
# here), most common states first
hbcu %>%
  count(state) %>%
  arrange(desc(n))
## # A tibble: 21 × 2
##    state     n
##    <chr> <int>
##  1 AL       14
##  2 GA       10
##  3 NC       10
##  4 TX        9
##  5 SC        8
##  6 MS        7
##  7 LA        6
##  8 TN        6
##  9 AR        4
## 10 FL        4
## # ℹ 11 more rows
# Same tally on the cleaned data, where state is a factor
count(hbcu2, state, sort = TRUE)
## # A tibble: 21 × 2
##    state     n
##    <fct> <int>
##  1 AL       14
##  2 GA       10
##  3 NC       10
##  4 TX        9
##  5 SC        8
##  6 MS        7
##  7 LA        6
##  8 TN        6
##  9 AR        4
## 10 FL        4
## # ℹ 11 more rows
# Horizontal bar chart of the number of HBCUs in each state.
# reorder() sorts the bars by count; coord_flip() turns the columns sideways
# so the state labels are easy to read.
ggplot(
  data = count(hbcu2, state),
  mapping = aes(x = reorder(state, n), y = n)
) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(
    x = "State",
    y = "Count",
    title = "Number of HBCUs by State"
  )

#install.packages("maps", repos = "http://cran.us.r-project.org")
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
# Plot every HBCU at its longitude/latitude on top of U.S. state outlines.
# coord_quickmap() derives the aspect ratio from the latitudes being plotted,
# replacing the hard-coded coord_fixed(1.3) approximation.
# NOTE: borders("state") only draws the contiguous states, so the point for
# the University of the Virgin Islands (state "USVI") has no outline under it.
hbcu2 %>%
  ggplot(aes(x = lon, y = lat)) +
  borders("state") +
  geom_point(color = "red", alpha = 0.6) +
  coord_quickmap() +
  labs(
    title = "Geographic Distribution of HBCUs in the U.S.",
    x = "Longitude",
    y = "Latitude"
  )