###############################################
#SalaryExploration.R
# GOAL:
# We are going to read salary information of
# faculty members at over 1100 US colleges
# and learn how to manipulate it.
#
# We have amply COMMENTED the code so that
# beginners will be comfortable with the
# operations being carried out...
###############################################
# This is the library to compute skew in
# a supplied dataset. If you have not already
# done it, please install the e1071 package
library(e1071)
## Warning: package 'e1071' was built under R version 3.2.2
# We have two options to read the data
# Option 1: Read the data directly from a file
# Make sure that UnivSalaries.csv is accessible
# otherwise, the script will not run
#
fac.sal <- read.csv("UnivSalaries.csv",
header=FALSE)
head(fac.sal)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 1 1061 Alaska Pacific University AK IIB 454 382 362 382 567 485 471 487
## 2 1063 Univ.Alaska-Fairbanks AK I 686 560 432 508 914 753 572 677
## 3 1065 Univ.Alaska-Southeast AK IIA 533 494 329 415 716 663 442 559
## 4 11462 Univ.Alaska-Anchorage AK IIA 612 507 414 498 825 681 557 670
## 5 1002 Alabama Agri.&Mech. Univ. AL IIA 442 369 310 350 530 444 376 423
## 6 1004 University of Montevallo AL IIA 441 385 310 388 542 473 383 477
## V13 V14 V15 V16 V17
## 1 6 11 9 4 32
## 2 74 125 118 40 404
## 3 9 26 20 9 70
## 4 115 124 101 21 392
## 5 59 77 102 24 262
## 6 57 33 35 2 127
str(fac.sal)
## 'data.frame': 1161 obs. of 17 variables:
## $ V1 : int 1061 1063 1065 11462 1002 1004 1008 1009 1012 1016 ...
## $ V2 : Factor w/ 1140 levels "Abilene Christian University",..: 6 936 937 935 5 1001 36 38 81 898 ...
## $ V3 : Factor w/ 52 levels "AK","AL","AR",..: 1 1 1 1 2 2 2 2 2 2 ...
## $ V4 : Factor w/ 4 levels "I","IIA","IIB",..: 3 1 2 2 2 2 3 1 3 3 ...
## $ V5 : Factor w/ 428 levels "*","1009","270",..: 133 351 212 287 121 120 145 258 177 185 ...
## $ V6 : Factor w/ 304 levels "*","234","247",..: 111 281 223 235 98 114 123 166 108 141 ...
## $ V7 : Factor w/ 236 levels "*","199","214",..: 120 190 87 172 68 68 109 132 80 117 ...
## $ V8 : int 382 508 415 498 350 388 396 455 401 411 ...
## $ V9 : Factor w/ 486 levels "*","1002","1016",..: 177 456 315 398 141 152 168 294 261 215 ...
## $ V10: Factor w/ 374 levels "*","292","295",..: 142 362 308 321 101 130 133 184 158 165 ...
## $ V11: Factor w/ 308 levels "*","246","261",..: 174 268 145 258 80 87 130 154 107 148 ...
## $ V12: int 487 677 559 670 423 477 478 546 523 503 ...
## $ V13: int 6 74 9 115 59 57 20 366 34 67 ...
## $ V14: int 11 125 26 124 77 33 18 354 25 40 ...
## $ V15: int 9 118 20 101 102 35 30 301 27 66 ...
## $ V16: int 4 40 9 21 24 2 0 66 3 27 ...
## $ V17: int 32 404 70 392 262 127 68 1109 89 200 ...
# Here's Option 2, which you may uncomment and
# try out if the URL is accessible from your location
# To uncomment/comment many lines at once
# use Ctrl-Shift-C or Cmd-Shift-C
# Read the data from the Internet site
# salaryURL <- url("http://lib.stat.cmu.edu/datasets/colleges/aaup.data")
# fac.sal <- read.csv(salaryURL,
# header = FALSE)
######################################
# Function to identify whether the
# supplied row contains a *
missingDataInRow <- function(row) {
# This is a flag that is set to
# FALSE by default
contains <- FALSE
# Set the flag to TRUE if any cell
# in the entered row contains a "*"
# indicating missing information
for(i in 1:length(row)) {
if (row[i] == "*") contains <- TRUE
}
return(contains)
}
# Missing value treatment: our approach
# is to delete rows with data entry ='*'
fac.sal$missing <- apply(fac.sal,
MARGIN=1, # row-wise
FUN=missingDataInRow)
# This is the CLEANED dataset which
# we shall work with for the rest of
# this program
fac.sal.new <- subset(fac.sal,
missing == FALSE)
# Renaming headers for better understanding
# Sal = Salary, Comp = Total Compensation
colnames(fac.sal.new)<-c("Fed_ID No.",
"University/College",
"State",
"Type",
"AveSal_Prof",
"AveSal_Asc.Prof",
"AveSal_Ast.Prof",
"AveSal_All",
"AveComp_Prof",
"AveComp_Asc.Prof",
"AveComp_Ast.Prof",
"AveComp_All",
"Num_Prof",
"Num_Asc.Prof",
"Num_Ast.Prof",
"Num_Instructors",
"Num_All",
"MissingData")
# Check out the newly cleaned dataset
# by displaying it on a tab
#View(fac.sal.new)
# Function to onverte the input data
# from factor (categorical) to numeric
convertToNum <- function(x) {
return(as.numeric(as.character(x)))
}
# Convert currency data represented as categorical
# into numerical
fac.sal.new$AveSal_Prof <- convertToNum(fac.sal.new$AveSal_Prof)
fac.sal.new$AveSal_Asc.Prof <- convertToNum(fac.sal.new$AveSal_Asc.Prof)
fac.sal.new$AveSal_Ast.Prof <- convertToNum(fac.sal.new$AveSal_Ast.Prof)
fac.sal.new$AveComp_Prof <- convertToNum(fac.sal.new$AveComp_Prof)
fac.sal.new$AveComp_Asc.Prof <- convertToNum(fac.sal.new$AveComp_Asc.Prof)
fac.sal.new$AveComp_Ast.Prof <- convertToNum(fac.sal.new$AveComp_Ast.Prof)
# Plotting histograms of salaries/compensation
# for each faculty position
# Display the plots in 2 rows & 3 columns
# Row 1: Salary Row 2: Compensation
par(mfrow=c(2,3))
hist(fac.sal.new$AveSal_Asc.Prof)
hist(fac.sal.new$AveSal_Prof)
hist(fac.sal.new$AveSal_Ast.Prof)
hist(fac.sal.new$AveComp_Prof)
hist(fac.sal.new$AveComp_Ast.Prof)
hist(fac.sal.new$AveComp_Asc.Prof)

# Display a few summary statistics
cat("Quick summary for Average Compensation of Profs\n")
## Quick summary for Average Compensation of Profs
print(summary(fac.sal.new$AveComp_Prof))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 319.0 549.2 637.0 656.4 755.0 1236.0
cat("Quick summary for Average Compensation of Associate Profs\n")
## Quick summary for Average Compensation of Associate Profs
print(summary(fac.sal.new$AveComp_Asc.Prof))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 295.0 462.0 522.0 528.6 586.8 909.0
cat("\nSkew in the numbers of Assistant Profs: ",
skewness(fac.sal.new$Num_Ast.Prof), "\n\n")
##
## Skew in the numbers of Assistant Profs: 2.114205
# Visually check for outliers within
# the average compendation variable.
# You may cycle through the remaining
# variables for practice
par(mfrow=c(1,1))
boxplot(fac.sal.new$AveComp_Prof,
horizontal = TRUE)
title(main="Average compensation of Profs across Colleges")

# Calculating which college pays the
# maximum salary to profs
# The summary function returns a
# 7-number summary for the dataset
salaryInfo <- summary(fac.sal.new$AveSal_Prof)
maxSal <- salaryInfo[6]
# Figure out which row(s) in the dataset
# carry this maximum salary figure - there
# could potentially be multiple rows...
index <- which(fac.sal.new$AveSal_Prof == maxSal,
arr.ind=TRUE)
# The second column in the dataset contains
# the name of the college
maxCollege <- as.character(fac.sal.new[index, 2])
# Output a message
cat(maxCollege, "pays its professors the most\n\n")
## Harvard University pays its professors the most
# Calculating which college has the
# least number of Assitant Profs
numAstProfInfo <- summary(fac.sal.new$Num_Ast.Prof)
minProfs <- numAstProfInfo[1]
# Figure out which row(s) in the dataset
# carry this minimum number - there
# could potentially be multiple rows..
index <- which(fac.sal.new$Num_Ast.Prof == minProfs,
arr.ind=TRUE)
minColleges <- as.character(fac.sal.new[index, 2])
cat("\nColleges with the minimum number of Asst Profs:\n\n")
##
## Colleges with the minimum number of Asst Profs:
print(minColleges)
## [1] "Covenant College" "Judson College"
## [3] "Grace College" "Center for Creative Studies"
## [5] "Cooper Union for Adv.Sci&Art"
# Checking whether total number of faculty
# positions in Purdue Univ.-Main Campus
# is represented by the column of Faculty-All ranks
PUIndex <- which(fac.sal.new[ , 2] =="Purdue Univ.-Main Campus",
arr.ind=TRUE)
x <- fac.sal.new$Num_All[PUIndex]
y <- fac.sal.new$Num_Prof[PUIndex] +
fac.sal.new$Num_Asc.Prof[PUIndex] +
fac.sal.new$Num_Ast.Prof[PUIndex]+
fac.sal.new$Num_Instructors[PUIndex]
# No.of colleges where AveSal of a Professor
# rounds up to 80% of the AveComp
fac.sal.new$Percent <- fac.sal.new$AveSal_Prof/fac.sal.new$AveComp_Prof*100
percentComp <- round(fac.sal.new$Percent)
# Which colleges
index <- which(percentComp == 80, arr.ind=TRUE)
collegesWith80 <- as.character(fac.sal.new[index, 2])
cat("\nColleges with Salary as 80% of Compensation:\n\n")
##
## Colleges with Salary as 80% of Compensation:
print(collegesWith80)
## [1] "Alaska Pacific University" "Oakwood College"
## [3] "Samford University" "Univ. Arkansas at Pine Bluff"
## [5] "Arkansas Tech University" "Arkansas State Univ.-Main"
## [7] "Henderson State University" "Ouachita Baptist University"
## [9] "Southern Ark. Univ.-Main" "California Lutheran Univ."
## [11] "Cal.St.Univ-Fullerton" "Cal.St.Univ-Hayward"
## [13] "Cal.St.Univ-Long Beach" "Cal.St.Univ-Dominguez Hills"
## [15] "Cal.Poly.St.U-Sn Luis Obispo" "Cal.Poly.St.Univ-Pomona"
## [17] "Cal.St.Univ-Chico" "Cal.St.Univ-Fresno"
## [19] "San Diego State University" "San Jose State University"
## [21] "Sonoma State University" "Claremont McKenna College"
## [23] "Harvey Mudd College" "Pomona College"
## [25] "University of La Verne" "Mills College"
## [27] "Point Loma Nazarene College" "San Francisco State Univ."
## [29] "University of San Diego" "University of New Haven"
## [31] "Sacred Heart University" "University of Hartford"
## [33] "Florida Southern College" "Rollins College"
## [35] "University Central Florida" "Univ. of Hawaii at Manoa"
## [37] "Buena Vista College" "Grand View College"
## [39] "St. Ambrose University" "University of Northern Iowa"
## [41] "University of Idaho" "Greenville College"
## [43] "McKendree College" "North Central College"
## [45] "Illinois Benedictine College" "Wheaton College"
## [47] "Butler University" "Earlham College"
## [49] "Franklin College of Indiana" "Indiana Wesleyan University"
## [51] "Saint Mary-of-the-Woods Coll" "Bethel College"
## [53] "Fort Hays State University" "Kansas State University"
## [55] "McPherson College" "Centre College"
## [57] "Eastern Kentucky University" "Kentucky State University"
## [59] "Morehead State University" "Centenary College of La."
## [61] "Bentley College" "Bradford College"
## [63] "Brandeis University" "Massachusetts Inst. of Tech."
## [65] "Fitchburg State College" "Mount Holyoke College"
## [67] "Regis College" "Univ. Massachussets-Amherst"
## [69] "Western New England College" "Goucher College"
## [71] "Morgan State University" "Salisbury State University"
## [73] "Bowdoin College" "Univ.of Maine at Orono"
## [75] "Univ.of Maine at Machias" "Concordia College"
## [77] "Univ.of Michigan-Flint" "Western Michigan University"
## [79] "University of St. Thomas" "William Jewell College"
## [81] "Belmont Abbey College" "Elon College"
## [83] "Guilford College" "High Point University"
## [85] "North Carolina Wesleyan Coll" "Univ. of North Dakota-Main"
## [87] "Valley City State University" "Plymouth State College"
## [89] "Saint Peter's College" "Stevens Institute of Tech."
## [91] "New Mexico Highlands Univ." "Canisius College"
## [93] "Clarkson University" "Hartwick College"
## [95] "Ithaca College" "Pace University"
## [97] "Rochester Institute of Tech." "Russell Sage College"
## [99] "Sarah Lawrence College" "Baldwin-Wallace College"
## [101] "Capital University" "Case Western Reserve Univ."
## [103] "Cedarville College" "Malone College"
## [105] "Ohio Northern University" "Ohio University-Athens"
## [107] "Univ. of Akron- Main Campus" "Xavier University"
## [109] "Cameron University" "University of Oklahoma"
## [111] "Lewis & Clark College" "Cedar Crest College"
## [113] "Drexel University" "Duquesne University"
## [115] "Holy Family College" "La Salle University"
## [117] "Lebanon Valley College" "Carlow College"
## [119] "Villanova University" "Washington & Jefferson Coll."
## [121] "Wilson College" "Salve Regina University"
## [123] "Central Wesleyan College" "Converse College"
## [125] "Winthrop University" "Tennessee State University"
## [127] "Union University" "Univ. of Tennessee- Memphis"
## [129] "Houston Baptist University" "Midwestern State University"
## [131] "Univ. of Texas-Pan Amer.Univ" "Southern Methodist Univ."
## [133] "Southwest Texas State Univ." "Texas Tech University"
## [135] "Texas Wesleyan University" "Texas Woman's University"
## [137] "Univ. of Texas at Arlington" "West Texas A&M University"
## [139] "Univ. of Texas -San Antonio" "Texas A&M Univ.at Galveston"
## [141] "Univ. of Houston-Clear Lake" "Univ. of Houston-Downtown"
## [143] "Bridgewater College" "Longwood College"
## [145] "Saint Paul's College" "Virginia Military Institute"
## [147] "Norfolk State University" "Green Mountain College"
## [149] "Eastern Washington Univ." "Pacific Lutheran University"
## [151] "Marian Coll.of Fond du Lac" "Marquette University"
## [153] "Mount Mary College" "Univ. Wisconsin-Madison"
## [155] "University of Charleston"