First, I install (if needed) and load the packages used in this analysis.
# One-time setup: uncomment to install the packages if they are missing.
# install.packages("tidyverse")
# install.packages("lubridate")
# Attach the core tidyverse packages (readr, dplyr, ggplot2, ...).
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# lubridate is already listed among the packages attached by
# library(tidyverse) (see its startup message), so this call is redundant
# but harmless.
library(lubridate)
# Download the HBCU CSV from GitHub and parse it into a tibble; the column
# types are guessed by readr (see the column specification it prints).
hbcu <- read_csv(
  file = "https://raw.githubusercontent.com/quant-shop/intro-comp-educ-soc/refs/heads/main/data/hbcu_data.csv"
)
## Rows: 102 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): name, city, state, type
## dbl (3): founded, lat, lon
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows (six is the default for head()).
head(hbcu, n = 6)
## # A tibble: 6 × 7
## name city state founded lat lon type
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 Alabama A&M University Normal AL 1875 34.8 -86.6 Public, 4 Year
## 2 Alabama State University Montgomery AL 1867 32.4 -86.3 Public, 4 Year
## 3 Albany State University Albany GA 1903 31.6 -84.2 Public, 4 Year
## 4 Alcorn State University Lorman MS 1871 31.9 -91.1 Public, 4 Year
## 5 Allen University Columbia SC 1870 34.0 -81.0 Private, 4 Year
## 6 American Baptist College Nashville TN 1924 36.2 -86.8 Private, 4 Year
# Preview the last six rows (six is the default for tail()).
tail(hbcu, n = 6)
## # A tibble: 6 × 7
## name city state founded lat lon type
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 Voorhees University Denmark SC 1962 33.3 -81.1 Privat…
## 2 West Virginia State University Institute WV 1865 38.4 -81.8 Public…
## 3 Wilberforce University Wilberforce OH 1856 39.7 -83.9 Privat…
## 4 Wiley College Marshall TX 1873 32.5 -94.4 Privat…
## 5 Winston-Salem State University Winston-Salem NC 1892 36.1 -80.2 Public…
## 6 Xavier University of Louisiana New Orleans LA 1915 30.0 -90.1 Privat…
# Show the first ten rows.
hbcu %>%
  head(n = 10)
## # A tibble: 10 × 7
## name city state founded lat lon type
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 Alabama A&M University Normal AL 1875 34.8 -86.6 Public, 4…
## 2 Alabama State University Montgomery AL 1867 32.4 -86.3 Public, 4…
## 3 Albany State University Albany GA 1903 31.6 -84.2 Public, 4…
## 4 Alcorn State University Lorman MS 1871 31.9 -91.1 Public, 4…
## 5 Allen University Columbia SC 1870 34.0 -81.0 Private, …
## 6 American Baptist College Nashville TN 1924 36.2 -86.8 Private, …
## 7 Arkansas Baptist College Little Rock AR 1884 34.7 -92.3 Private, …
## 8 Benedict College Columbia SC 1870 34.0 -81.0 Private, …
## 9 Bennett College Greensboro NC 1873 36.1 -79.8 Private, …
## 10 Bethune-Cookman University Daytona Beach FL 1904 29.2 -81.0 Private, …
# Show the last twenty rows.
hbcu %>%
  tail(n = 20)
## # A tibble: 20 × 7
## name city state founded lat lon type
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 St. Philip's College San A… TX 1898 29.4 -98.5 Publ…
## 2 Stillman College Tusca… AL 1876 33.2 -87.6 Priv…
## 3 Talladega College Talla… AL 1867 33.5 -86.1 Priv…
## 4 Tennessee State University Nashv… TN 1912 36.2 -86.8 Publ…
## 5 Texas College Tyler TX 1894 32.4 -95.3 Priv…
## 6 Texas Southern University Houst… TX 1947 29.7 -95.4 Publ…
## 7 Tougaloo College Touga… MS 1871 32.4 -90.5 Priv…
## 8 Tuskegee University Tuske… AL 1867 32.4 -85.7 Priv…
## 9 University of Arkansas at Pine Bluff Pine … AR 1873 34.2 -92.0 Publ…
## 10 University of Maryland Eastern Shore Princ… MD 1886 38.2 -75.7 Publ…
## 11 University of the District of Columbia Washi… DC 1962 38.9 -77.0 Publ…
## 12 University of the Virgin Islands St. T… USVI 1882 18.3 -65.0 Publ…
## 13 Virginia State University Peter… VA 1865 37.2 -77.4 Publ…
## 14 Virginia Union University Richm… VA 1886 37.5 -77.5 Priv…
## 15 Voorhees University Denma… SC 1962 33.3 -81.1 Priv…
## 16 West Virginia State University Insti… WV 1865 38.4 -81.8 Publ…
## 17 Wilberforce University Wilbe… OH 1856 39.7 -83.9 Priv…
## 18 Wiley College Marsh… TX 1873 32.5 -94.4 Priv…
## 19 Winston-Salem State University Winst… NC 1892 36.1 -80.2 Publ…
## 20 Xavier University of Louisiana New O… LA 1915 30.0 -90.1 Priv…
Get a glimpse of the data
# glimpse() gives a sneak peek of the dataset: one line per column with its
# type and the first few values.
hbcu %>%
  glimpse()
## Rows: 102
## Columns: 7
## $ name <chr> "Alabama A&M University", "Alabama State University", "Albany …
## $ city <chr> "Normal", "Montgomery", "Albany", "Lorman", "Columbia", "Nashv…
## $ state <chr> "AL", "AL", "GA", "MS", "SC", "TN", "AR", "SC", "NC", "FL", "A…
## $ founded <dbl> 1875, 1867, 1903, 1871, 1870, 1924, 1884, 1870, 1873, 1904, 19…
## $ lat <dbl> 34.7834, 32.3643, 31.5785, 31.8769, 34.0298, 36.1659, 34.7465,…
## $ lon <dbl> -86.5683, -86.2952, -84.1543, -91.1458, -81.0115, -86.7844, -9…
## $ type <chr> "Public, 4 Year", "Public, 4 Year", "Public, 4 Year", "Public,…
I want to get a summary of the data.
# Per-column summary statistics (min/quartiles/mean/max for numeric columns,
# length and class for character columns).
hbcu %>%
  summary()
## name city state founded
## Length:102 Length:102 Length:102 Min. :1837
## Class :character Class :character Class :character 1st Qu.:1870
## Mode :character Mode :character Mode :character Median :1886
## Mean :1895
## 3rd Qu.:1905
## Max. :1988
## lat lon type
## Min. :18.34 Min. :-98.50 Length:102
## 1st Qu.:32.48 1st Qu.:-90.13 Class :character
## Median :34.02 Median :-84.64 Mode :character
## Mean :34.31 Mean :-85.13
## 3rd Qu.:36.17 3rd Qu.:-80.78
## Max. :39.93 Max. :-64.96
# Look up the most recently founded institution(s). The year is derived from
# the data instead of hard-coding the 1988 maximum read off summary(), so the
# query stays correct if the dataset changes; ties are all returned.
hbcu %>%
  filter(founded == max(founded, na.rm = TRUE))
## # A tibble: 1 × 7
## name city state founded lat lon type
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 Clark Atlanta University Atlanta GA 1988 33.7 -84.4 Private, 4 Year
# Show the internal structure of the data frame and its variables.
str(object = hbcu)
## spc_tbl_ [102 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ name : chr [1:102] "Alabama A&M University" "Alabama State University" "Albany State University" "Alcorn State University" ...
## $ city : chr [1:102] "Normal" "Montgomery" "Albany" "Lorman" ...
## $ state : chr [1:102] "AL" "AL" "GA" "MS" ...
## $ founded: num [1:102] 1875 1867 1903 1871 1870 ...
## $ lat : num [1:102] 34.8 32.4 31.6 31.9 34 ...
## $ lon : num [1:102] -86.6 -86.3 -84.2 -91.1 -81 ...
## $ type : chr [1:102] "Public, 4 Year" "Public, 4 Year" "Public, 4 Year" "Public, 4 Year" ...
## - attr(*, "spec")=
## .. cols(
## .. name = col_character(),
## .. city = col_character(),
## .. state = col_character(),
## .. founded = col_double(),
## .. lat = col_double(),
## .. lon = col_double(),
## .. type = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Clean the data to make analysis easier and, more importantly, to prepare it for plotting and mapping.
# Build an analysis-ready copy of the data: `founded` is coerced to numeric
# and the categorical columns `type` and `state` are converted to factors.
hbcu2 <- mutate(
  hbcu,
  founded = as.numeric(founded),
  across(c(type, state), as.factor)
)

# Print the cleaned tibble to confirm the new column types.
hbcu2
## # A tibble: 102 × 7
## name city state founded lat lon type
## <chr> <chr> <fct> <dbl> <dbl> <dbl> <fct>
## 1 Alabama A&M University Normal AL 1875 34.8 -86.6 Public, 4…
## 2 Alabama State University Montgomery AL 1867 32.4 -86.3 Public, 4…
## 3 Albany State University Albany GA 1903 31.6 -84.2 Public, 4…
## 4 Alcorn State University Lorman MS 1871 31.9 -91.1 Public, 4…
## 5 Allen University Columbia SC 1870 34.0 -81.0 Private, …
## 6 American Baptist College Nashville TN 1924 36.2 -86.8 Private, …
## 7 Arkansas Baptist College Little Rock AR 1884 34.7 -92.3 Private, …
## 8 Benedict College Columbia SC 1870 34.0 -81.0 Private, …
## 9 Bennett College Greensboro NC 1873 36.1 -79.8 Private, …
## 10 Bethune-Cookman University Daytona Beach FL 1904 29.2 -81.0 Private, …
## # ℹ 92 more rows
What is the distribution by HBCU type?
# Tally institutions per type, most common type first.
count(hbcu2, type, sort = TRUE)
## # A tibble: 5 × 2
## type n
## <fct> <int>
## 1 Private, 4 Year 45
## 2 Public, 4 Year 40
## 3 Public, 2 Year 11
## 4 Private, Specialized 4
## 5 Private, 2 Year 2
# Histogram of founding years in 10-year bins.
# By default ggplot2 centres bins of a given width on multiples of that width
# (here 1865-1875, 1875-1885, ...), so the bars would not match calendar
# decades. `boundary = 0` aligns bin edges with decades, and
# `closed = "left"` puts a year such as 1870 in the 1870s bin rather than
# the 1860s one.
hbcu %>%
  ggplot(aes(x = founded)) +
  geom_histogram(
    binwidth = 10,
    boundary = 0,
    closed = "left",
    fill = "steelblue",
    color = "white"
  ) +
  labs(
    title = "Founding Years of HBCUs",
    x = "Year Founded",
    y = "Number of Institutions"
  )
# Number of institutions per state in the raw data (`state` is character),
# largest count first.
count(hbcu, state, sort = TRUE)
## # A tibble: 21 × 2
## state n
## <chr> <int>
## 1 AL 14
## 2 GA 10
## 3 NC 10
## 4 TX 9
## 5 SC 8
## 6 MS 7
## 7 LA 6
## 8 TN 6
## 9 AR 4
## 10 FL 4
## # ℹ 11 more rows
# Same tally on the cleaned data, where `state` is a factor.
count(hbcu2, state, sort = TRUE)
## # A tibble: 21 × 2
## state n
## <fct> <int>
## 1 AL 14
## 2 GA 10
## 3 NC 10
## 4 TX 9
## 5 SC 8
## 6 MS 7
## 7 LA 6
## 8 TN 6
## 9 AR 4
## 10 FL 4
## # ℹ 11 more rows
# Horizontal bar chart of institutions per state. reorder() sorts the states
# by their count so the bars are ordered by size, and coord_flip() turns the
# bars sideways so the state labels are easy to read.
ggplot(count(hbcu2, state), aes(x = reorder(state, n), y = n)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(
    title = "Number of HBCUs by State",
    x = "State",
    y = "Count"
  )
# One-time setup: uncomment to install the maps package if it is missing.
# install.packages("maps", repos = "http://cran.us.r-project.org")
# NOTE: attaching maps masks purrr::map() (see the message printed on load);
# call purrr::map() explicitly if the purrr version is needed afterwards.
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
# Plot every institution at its longitude/latitude on top of U.S. state
# outlines. coord_fixed() pins the y/x aspect ratio at 1.3 so the map is not
# stretched with the plot window.
# NOTE(review): the data include a U.S. Virgin Islands campus (lat ~18.3,
# lon ~-65.0), which presumably falls outside the "state" outlines.
ggplot(hbcu2, aes(x = lon, y = lat)) +
  borders("state") +
  geom_point(color = "red", alpha = 0.6) +
  coord_fixed(ratio = 1.3) +
  labs(
    title = "Geographic Distribution of HBCUs in the U.S.",
    x = "Longitude",
    y = "Latitude"
  )