1. Load the data
library(readr)
## Warning: package 'readr' was built under R version 3.4.4
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the institution file into a tibble; no column types are supplied,
# so readr guesses each column's type from the file contents.
# readr reports 2 parsing failures for this file (row 68, column CHFTITLE);
# problems(AU) lists them.
AU <- read_csv(file = "hd2016.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## INSTNM = col_character(),
## IALIAS = col_character(),
## ADDR = col_character(),
## CITY = col_character(),
## STABBR = col_character(),
## ZIP = col_character(),
## CHFNM = col_character(),
## CHFTITLE = col_character(),
## GENTELE = col_double(),
## EIN = col_character(),
## DUNS = col_character(),
## OPEID = col_character(),
## WEBADDR = col_character(),
## ADMINURL = col_character(),
## FAIDURL = col_character(),
## APPLURL = col_character(),
## NPRICURL = col_character(),
## VETURL = col_character(),
## ATHURL = col_character(),
## DISAURL = col_character()
## # ... with 6 more columns
## )
## See spec(...) for full column specifications.
## Warning: 2 parsing failures.
## row # A tibble: 2 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 68 CHFTITLE delimiter or quote A 'hd2016.csv' file 2 68 CHFTITLE delimiter or quote 'hd2016.csv'
# Preview the first six rows of the loaded data.
head(AU, n = 6L)
## # A tibble: 6 x 70
## UNITID INSTNM
## <int> <chr>
## 1 100654 Alabama A & M University
## 2 100663 University of Alabama at Birmingham
## 3 100690 Amridge University
## 4 100706 University of Alabama in Huntsville
## 5 100724 Alabama State University
## 6 100733 University of Alabama System Office
## # ... with 68 more variables: IALIAS <chr>, ADDR <chr>, CITY <chr>,
## # STABBR <chr>, ZIP <chr>, FIPS <int>, OBEREG <int>, CHFNM <chr>,
## # CHFTITLE <chr>, GENTELE <dbl>, EIN <chr>, DUNS <chr>, OPEID <chr>,
## # OPEFLAG <int>, WEBADDR <chr>, ADMINURL <chr>, FAIDURL <chr>,
## # APPLURL <chr>, NPRICURL <chr>, VETURL <chr>, ATHURL <chr>,
## # DISAURL <chr>, SECTOR <int>, ICLEVEL <int>, CONTROL <int>,
## # HLOFFER <int>, UGOFFER <int>, GROFFER <int>, HDEGOFR1 <int>,
## # DEGGRANT <int>, HBCU <int>, HOSPITAL <int>, MEDICAL <int>,
## # TRIBAL <int>, LOCALE <int>, OPENPUBL <int>, ACT <chr>, NEWID <int>,
## # DEATHYR <int>, CLOSEDAT <chr>, CYACTIVE <int>, POSTSEC <int>,
## # PSEFLAG <int>, PSET4FLG <int>, RPTMTH <int>, INSTCAT <int>,
## # C15BASIC <int>, C15IPUG <int>, C15IPGRD <int>, C15UGPRF <int>,
## # C15ENPRF <int>, C15SZSET <int>, CCBASIC <int>, CARNEGIE <int>,
## # LANDGRNT <int>, INSTSIZE <int>, F1SYSTYP <int>, F1SYSNAM <chr>,
## # F1SYSCOD <int>, CBSA <int>, CBSATYPE <int>, CSA <int>, NECTA <int>,
## # COUNTYCD <int>, COUNTYNM <chr>, CNGDSTCD <int>, LONGITUD <dbl>,
## # LATITUDE <dbl>
# Per-column summary of the loaded data.
summary(object = AU)
## UNITID INSTNM IALIAS ADDR
## Min. :100654 Length:7521 Length:7521 Length:7521
## 1st Qu.:173258 Class :character Class :character Class :character
## Median :227979 Mode :character Mode :character Mode :character
## Mean :291277
## 3rd Qu.:446224
## Max. :490018
##
## CITY STABBR ZIP FIPS
## Length:7521 Length:7521 Length:7521 Min. : 1.00
## Class :character Class :character Class :character 1st Qu.:13.00
## Mode :character Mode :character Mode :character Median :29.00
## Mean :29.18
## 3rd Qu.:42.00
## Max. :78.00
##
## OBEREG CHFNM CHFTITLE GENTELE
## Min. :0.000 Length:7521 Length:7521 Min. :2.012e+09
## 1st Qu.:3.000 Class :character Class :character 1st Qu.:4.105e+09
## Median :5.000 Mode :character Mode :character Median :6.508e+09
## Mean :4.621 Mean :5.859e+12
## 3rd Qu.:6.000 3rd Qu.:8.433e+09
## Max. :9.000 Max. :9.785e+14
## NA's :67
## EIN DUNS OPEID OPEFLAG
## Length:7521 Length:7521 Length:7521 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:1.000
## Mode :character Mode :character Mode :character Median :1.000
## Mean :1.351
## 3rd Qu.:1.000
## Max. :7.000
##
## WEBADDR ADMINURL FAIDURL
## Length:7521 Length:7521 Length:7521
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## APPLURL NPRICURL VETURL
## Length:7521 Length:7521 Length:7521
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## ATHURL DISAURL SECTOR ICLEVEL
## Length:7521 Length:7521 Min. : 0.000 Min. :-3.000
## Class :character Class :character 1st Qu.: 2.000 1st Qu.: 1.000
## Mode :character Mode :character Median : 4.000 Median : 2.000
## Mean : 5.499 Mean : 1.806
## 3rd Qu.: 8.000 3rd Qu.: 3.000
## Max. :99.000 Max. : 3.000
##
## CONTROL HLOFFER UGOFFER GROFFER
## Min. :-3.000 Min. :-3.000 Min. :-3.000 Min. :-3.000
## 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 2.000 Median : 4.000 Median : 1.000 Median : 2.000
## Mean : 2.133 Mean : 4.543 Mean : 1.009 Mean : 1.658
## 3rd Qu.: 3.000 3rd Qu.: 7.000 3rd Qu.: 1.000 3rd Qu.: 2.000
## Max. : 3.000 Max. : 9.000 Max. : 2.000 Max. : 2.000
##
## HDEGOFR1 DEGGRANT HBCU HOSPITAL
## Min. :-3.00 Min. :-3.00 Min. :1.000 Min. :-2.0000
## 1st Qu.: 0.00 1st Qu.: 1.00 1st Qu.:2.000 1st Qu.:-2.0000
## Median :13.00 Median : 1.00 Median :2.000 Median : 2.0000
## Mean :17.34 Mean : 1.31 Mean :1.987 Mean : 0.3773
## 3rd Qu.:30.00 3rd Qu.: 2.00 3rd Qu.:2.000 3rd Qu.: 2.0000
## Max. :40.00 Max. : 2.00 Max. :2.000 Max. : 2.0000
##
## MEDICAL TRIBAL LOCALE OPENPUBL
## Min. :-2.000 Min. :1.000 Min. :-3.00 Min. :0.0000
## 1st Qu.: 2.000 1st Qu.:2.000 1st Qu.:12.00 1st Qu.:1.0000
## Median : 2.000 Median :2.000 Median :21.00 Median :1.0000
## Mean : 1.696 Mean :1.995 Mean :19.54 Mean :0.9996
## 3rd Qu.: 2.000 3rd Qu.:2.000 3rd Qu.:22.00 3rd Qu.:1.0000
## Max. : 2.000 Max. :2.000 Max. :43.00 Max. :1.0000
##
## ACT NEWID DEATHYR CLOSEDAT
## Length:7521 Min. : -2 Min. : -2.00 Length:7521
## Class :character 1st Qu.: -2 1st Qu.: -2.00 Class :character
## Mode :character Median : -2 Median : -2.00 Mode :character
## Mean : 2966 Mean : 79.84
## 3rd Qu.: -2 3rd Qu.: -2.00
## Max. :489937 Max. :2016.00
##
## CYACTIVE POSTSEC PSEFLAG PSET4FLG
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :1.000 Median :1.000 Median :1.000
## Mean :1.076 Mean :1.006 Mean :1.082 Mean :1.369
## 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :3.000 Max. :2.000 Max. :3.000 Max. :9.000
##
## RPTMTH INSTCAT C15BASIC C15IPUG
## Min. :-2.000 Min. :-2.000 Min. :-2.000 Min. :-2.000
## 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.:-2.000 1st Qu.:-2.000
## Median : 1.000 Median : 4.000 Median : 7.000 Median : 2.000
## Mean : 1.233 Mean : 3.502 Mean : 9.323 Mean : 4.592
## 3rd Qu.: 2.000 3rd Qu.: 6.000 3rd Qu.:20.000 3rd Qu.:12.000
## Max. : 3.000 Max. : 6.000 Max. :33.000 Max. :20.000
##
## C15IPGRD C15UGPRF C15ENPRF C15SZSET
## Min. :-2.000 Min. :-2.000 Min. :-2.0000 Min. :-2.000
## 1st Qu.:-2.000 1st Qu.:-2.000 1st Qu.:-2.0000 1st Qu.:-2.000
## Median : 0.000 Median : 1.000 Median : 1.0000 Median : 2.000
## Mean : 1.546 Mean : 3.264 Mean : 0.8827 Mean : 3.895
## 3rd Qu.: 2.000 3rd Qu.: 7.000 3rd Qu.: 3.0000 3rd Qu.: 8.000
## Max. :18.000 Max. :15.000 Max. : 7.0000 Max. :18.000
##
## CCBASIC CARNEGIE LANDGRNT INSTSIZE
## Min. :-3.000 Min. :-3.00 Min. :1.000 Min. :-2.000
## 1st Qu.:-3.000 1st Qu.:-3.00 1st Qu.:2.000 1st Qu.: 1.000
## Median : 3.000 Median :-3.00 Median :2.000 Median : 1.000
## Mean : 7.398 Mean :14.55 Mean :1.986 Mean : 1.539
## 3rd Qu.:18.000 3rd Qu.:40.00 3rd Qu.:2.000 3rd Qu.: 2.000
## Max. :33.000 Max. :60.00 Max. :2.000 Max. : 5.000
##
## F1SYSTYP F1SYSNAM F1SYSCOD CBSA
## Min. :-2.000 Length:7521 Min. : -2 Min. : -2
## 1st Qu.: 1.000 Class :character 1st Qu.: -2 1st Qu.:19100
## Median : 2.000 Mode :character Median : -2 Median :31080
## Mean : 1.353 Mean : 94070 Mean :29174
## 3rd Qu.: 2.000 3rd Qu.:200080 3rd Qu.:38900
## Max. : 2.000 Max. :400010 Max. :49780
##
## CBSATYPE CSA NECTA COUNTYCD
## Min. :-2.0000 Min. : -2.0 Min. : -2 Min. : -2
## 1st Qu.: 1.0000 1st Qu.:122.0 1st Qu.: -2 1st Qu.:13121
## Median : 1.0000 Median :288.0 Median : -2 Median :29183
## Mean : 0.9734 Mean :260.5 Mean : 3822 Mean :29235
## 3rd Qu.: 1.0000 3rd Qu.:408.0 3rd Qu.: -2 3rd Qu.:42039
## Max. : 2.0000 Max. :566.0 Max. :79600 Max. :78030
##
## COUNTYNM CNGDSTCD LONGITUD LATITUDE
## Length:7521 Min. : -2 Min. :-170.74 Min. :-14.32
## Class :character 1st Qu.:1307 1st Qu.: -97.36 1st Qu.: 33.95
## Mode :character Median :2906 Median : -86.66 Median : 38.77
## Mean :2927 Mean : -90.47 Mean : 37.37
## 3rd Qu.:4206 3rd Qu.: -79.23 3rd Qu.: 41.33
## Max. :7898 Max. : 171.38 Max. : 71.32
##
2. Subset the data using dplyr to include five states
# Keep only the institutions located in the five states of interest.
#
# `%in%` tests every STABBR value for membership in the set of states.
# Comparing with `==` against a length-5 vector instead recycles that vector
# down the column, so row i is compared with just one of the five states;
# most matching rows are silently dropped and R warns that the "longer
# object length is not a multiple of shorter object length".
#
# NOTE(review): the printed output further down this report was produced
# with the earlier `==` filter; re-knit the report to refresh it.
AU5 <- filter(AU, STABBR %in% c("CA", "TX", "FL", "NY", "PA"))
# Preview the first six rows of the subset.
head(AU5)
## # A tibble: 6 x 70
## UNITID INSTNM IALIAS
## <int> <chr> <chr>
## 1 108250 ITT Technical Institute-Rancho Cordova <NA>
## 2 108807 Allan Hancock College <NA>
## 3 109040 American Career College-Los Angeles <NA>
## 4 109721 Associated Technical College-Los Angeles <NA>
## 5 109934 Bellus Academy-National City Bellus Academy
## 6 110219 Bryan University <NA>
## # ... with 67 more variables: ADDR <chr>, CITY <chr>, STABBR <chr>,
## # ZIP <chr>, FIPS <int>, OBEREG <int>, CHFNM <chr>, CHFTITLE <chr>,
## # GENTELE <dbl>, EIN <chr>, DUNS <chr>, OPEID <chr>, OPEFLAG <int>,
## # WEBADDR <chr>, ADMINURL <chr>, FAIDURL <chr>, APPLURL <chr>,
## # NPRICURL <chr>, VETURL <chr>, ATHURL <chr>, DISAURL <chr>,
## # SECTOR <int>, ICLEVEL <int>, CONTROL <int>, HLOFFER <int>,
## # UGOFFER <int>, GROFFER <int>, HDEGOFR1 <int>, DEGGRANT <int>,
## # HBCU <int>, HOSPITAL <int>, MEDICAL <int>, TRIBAL <int>, LOCALE <int>,
## # OPENPUBL <int>, ACT <chr>, NEWID <int>, DEATHYR <int>, CLOSEDAT <chr>,
## # CYACTIVE <int>, POSTSEC <int>, PSEFLAG <int>, PSET4FLG <int>,
## # RPTMTH <int>, INSTCAT <int>, C15BASIC <int>, C15IPUG <int>,
## # C15IPGRD <int>, C15UGPRF <int>, C15ENPRF <int>, C15SZSET <int>,
## # CCBASIC <int>, CARNEGIE <int>, LANDGRNT <int>, INSTSIZE <int>,
## # F1SYSTYP <int>, F1SYSNAM <chr>, F1SYSCOD <int>, CBSA <int>,
## # CBSATYPE <int>, CSA <int>, NECTA <int>, COUNTYCD <int>,
## # COUNTYNM <chr>, CNGDSTCD <int>, LONGITUD <dbl>, LATITUDE <dbl>
3. Using dplyr, count the number of universities by a categorical variable. Note: you may need to change the variable from character to another format first. Provide a tibble output from dplyr.
# Count the institutions in AU5 for each SECTOR code, then order the
# codes from the least to the most frequent.
Sector <- AU5 %>%
  group_by(SECTOR) %>%
  summarise(count = n()) %>%
  arrange(count)
# Print the resulting tibble.
Sector
## # A tibble: 11 x 2
## SECTOR count
## <int> <int>
## 1 99 1
## 2 0 2
## 3 8 6
## 4 5 14
## 5 7 16
## 6 1 43
## 7 3 48
## 8 4 50
## 9 6 73
## 10 2 113
## 11 9 120
4. Make three ggplot graphs of your findings
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
# Bar chart: number of institutions per state, each state with its own fill.
g <- ggplot(data = AU5, mapping = aes(x = STABBR, fill = STABBR))
g +
  geom_bar() +
  ggtitle("Bar Plot Showing The Count of University by State")

# Base-graphics pie chart of the per-state counts, slices drawn clockwise.
pie(
  x = table(AU5$STABBR),
  main = "Pie Chart of Universities in different states",
  clockwise = TRUE
)

# Bar chart of ICLEVEL with each bar split by state; a text label is placed
# at x = 2, y = 250.
d <- ggplot(data = AU5, mapping = aes(x = ICLEVEL))
d +
  geom_bar(mapping = aes(fill = STABBR), width = 0.5) +
  labs(title = "Histogram Showing ICLEVEL Across States") +
  annotate(geom = "text", x = 2, y = 250, label = "Lowest Count")
Summary: This report contains a brief analysis of university information in five US states: California (CA), Texas (TX), Florida (FL), Pennsylvania (PA), and New York (NY).
I used the following methods in the analysis: adjusting missing values, ordering states by number of universities, conducting differentiation analysis between them, and conducting comparison analysis with the help of data visualization. The results of the analysis show that CA has the greatest number of universities. The greatest proportion of ICLEVEL values are 1, followed by 3 and then 2.