###############################################
#SalaryExploration.R

# GOAL:
# We are going to read salary information of
# faculty members at over 1100 US colleges 
# and learn how to manipulate it.
#
# We have amply COMMENTED the code so that
# beginners will be comfortable with the 
# operations being carried out...
###############################################

# This is the library to compute skew in 
# a supplied dataset. If you have not already 
# done it, please install the e1071 package
library(e1071)
## Warning: package 'e1071' was built under R version 3.2.2
# We have two options to read the data
# Option 1: Read the data directly from a file
# Make sure that UnivSalaries.csv is accessible
# otherwise, the script will not run
#
fac.sal <- read.csv("UnivSalaries.csv",
                    header=FALSE)
head(fac.sal)
##      V1                        V2 V3  V4  V5  V6  V7  V8  V9 V10 V11 V12
## 1  1061 Alaska Pacific University AK IIB 454 382 362 382 567 485 471 487
## 2  1063     Univ.Alaska-Fairbanks AK   I 686 560 432 508 914 753 572 677
## 3  1065     Univ.Alaska-Southeast AK IIA 533 494 329 415 716 663 442 559
## 4 11462     Univ.Alaska-Anchorage AK IIA 612 507 414 498 825 681 557 670
## 5  1002 Alabama Agri.&Mech. Univ. AL IIA 442 369 310 350 530 444 376 423
## 6  1004  University of Montevallo AL IIA 441 385 310 388 542 473 383 477
##   V13 V14 V15 V16 V17
## 1   6  11   9   4  32
## 2  74 125 118  40 404
## 3   9  26  20   9  70
## 4 115 124 101  21 392
## 5  59  77 102  24 262
## 6  57  33  35   2 127
str(fac.sal)
## 'data.frame':    1161 obs. of  17 variables:
##  $ V1 : int  1061 1063 1065 11462 1002 1004 1008 1009 1012 1016 ...
##  $ V2 : Factor w/ 1140 levels "Abilene Christian University",..: 6 936 937 935 5 1001 36 38 81 898 ...
##  $ V3 : Factor w/ 52 levels "AK","AL","AR",..: 1 1 1 1 2 2 2 2 2 2 ...
##  $ V4 : Factor w/ 4 levels "I","IIA","IIB",..: 3 1 2 2 2 2 3 1 3 3 ...
##  $ V5 : Factor w/ 428 levels "*","1009","270",..: 133 351 212 287 121 120 145 258 177 185 ...
##  $ V6 : Factor w/ 304 levels "*","234","247",..: 111 281 223 235 98 114 123 166 108 141 ...
##  $ V7 : Factor w/ 236 levels "*","199","214",..: 120 190 87 172 68 68 109 132 80 117 ...
##  $ V8 : int  382 508 415 498 350 388 396 455 401 411 ...
##  $ V9 : Factor w/ 486 levels "*","1002","1016",..: 177 456 315 398 141 152 168 294 261 215 ...
##  $ V10: Factor w/ 374 levels "*","292","295",..: 142 362 308 321 101 130 133 184 158 165 ...
##  $ V11: Factor w/ 308 levels "*","246","261",..: 174 268 145 258 80 87 130 154 107 148 ...
##  $ V12: int  487 677 559 670 423 477 478 546 523 503 ...
##  $ V13: int  6 74 9 115 59 57 20 366 34 67 ...
##  $ V14: int  11 125 26 124 77 33 18 354 25 40 ...
##  $ V15: int  9 118 20 101 102 35 30 301 27 66 ...
##  $ V16: int  4 40 9 21 24 2 0 66 3 27 ...
##  $ V17: int  32 404 70 392 262 127 68 1109 89 200 ...
# Here's Option 2, which you may uncomment and
# try out if the URL is accessible from your location

# To uncomment/comment many lines at once
# use Ctrl-Shift-C or Cmd-Shift-C

# Read the data from the Internet site
# salaryURL <- url("http://lib.stat.cmu.edu/datasets/colleges/aaup.data")
# fac.sal <- read.csv(salaryURL,
#                     header = FALSE)

######################################
# Function to identify whether the
# supplied row contains a *
missingDataInRow <- function(row) {
  
  # This is a flag that is set to
  # FALSE by default
  contains <- FALSE
  
  # Set the flag to TRUE if any cell
  # in the entered row contains a "*"
  # indicating missing information
  for(i in 1:length(row)) {
    if (row[i] == "*") contains <- TRUE
  }
  return(contains)
}

# Missing value treatment: our approach
# is to delete rows with data entry ='*'
fac.sal$missing <- apply(fac.sal,
                         MARGIN=1, # row-wise
                         FUN=missingDataInRow)
  
# This is the CLEANED dataset which 
# we shall work with for the rest of 
# this program
fac.sal.new <- subset(fac.sal, 
                      missing == FALSE)
                      
# Renaming headers for better understanding
# Sal = Salary, Comp = Total Compensation
colnames(fac.sal.new)<-c("Fed_ID No.",
                         "University/College",
                         "State",
                         "Type",
                         "AveSal_Prof",
                         "AveSal_Asc.Prof",
                         "AveSal_Ast.Prof",
                         "AveSal_All",
                         "AveComp_Prof",
                         "AveComp_Asc.Prof",
                         "AveComp_Ast.Prof",
                         "AveComp_All",
                         "Num_Prof",
                         "Num_Asc.Prof",
                         "Num_Ast.Prof",
                         "Num_Instructors",
                         "Num_All",
                         "MissingData")

# Check out the newly cleaned dataset
# by displaying it on a tab
#View(fac.sal.new)

# Function to onverte the input data 
# from factor (categorical) to numeric
convertToNum <- function(x) {
  return(as.numeric(as.character(x)))
}

# Convert currency data represented as categorical 
# into numerical
fac.sal.new$AveSal_Prof <- convertToNum(fac.sal.new$AveSal_Prof)
fac.sal.new$AveSal_Asc.Prof <- convertToNum(fac.sal.new$AveSal_Asc.Prof)
fac.sal.new$AveSal_Ast.Prof <- convertToNum(fac.sal.new$AveSal_Ast.Prof)
fac.sal.new$AveComp_Prof <- convertToNum(fac.sal.new$AveComp_Prof)
fac.sal.new$AveComp_Asc.Prof <- convertToNum(fac.sal.new$AveComp_Asc.Prof)
fac.sal.new$AveComp_Ast.Prof <- convertToNum(fac.sal.new$AveComp_Ast.Prof)

# Plotting histograms of salaries/compensation 
# for each faculty position

# Display the plots in 2 rows & 3 columns
# Row 1: Salary Row 2: Compensation
par(mfrow=c(2,3))
hist(fac.sal.new$AveSal_Asc.Prof)
hist(fac.sal.new$AveSal_Prof)
hist(fac.sal.new$AveSal_Ast.Prof)
hist(fac.sal.new$AveComp_Prof)
hist(fac.sal.new$AveComp_Ast.Prof)
hist(fac.sal.new$AveComp_Asc.Prof)

# Display a few summary statistics
cat("Quick summary for Average Compensation of Profs\n")
## Quick summary for Average Compensation of Profs
print(summary(fac.sal.new$AveComp_Prof))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   319.0   549.2   637.0   656.4   755.0  1236.0
cat("Quick summary for Average Compensation of Associate Profs\n")
## Quick summary for Average Compensation of Associate Profs
print(summary(fac.sal.new$AveComp_Asc.Prof))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   295.0   462.0   522.0   528.6   586.8   909.0
cat("\nSkew in the numbers of Assistant Profs: ",
    skewness(fac.sal.new$Num_Ast.Prof), "\n\n")
## 
## Skew in the numbers of Assistant Profs:  2.114205
# Visually check for outliers within 
# the average compendation variable. 
# You may cycle through the remaining 
# variables for practice
par(mfrow=c(1,1))
boxplot(fac.sal.new$AveComp_Prof, 
        horizontal = TRUE)
title(main="Average compensation of Profs across Colleges")

# Calculating which college pays the 
# maximum salary to profs
# The summary function returns a 
# 7-number summary for the dataset
salaryInfo <- summary(fac.sal.new$AveSal_Prof)
maxSal <- salaryInfo[6]

# Figure out which row(s) in the dataset
# carry this maximum salary figure - there 
# could potentially be multiple rows...
index <- which(fac.sal.new$AveSal_Prof == maxSal,
               arr.ind=TRUE)

# The second column in the dataset contains
# the name of the college
maxCollege <- as.character(fac.sal.new[index, 2])

# Output a message
cat(maxCollege, "pays its professors the most\n\n")
## Harvard University pays its professors the most
# Calculating which college has the 
# least number of Assitant Profs
numAstProfInfo <- summary(fac.sal.new$Num_Ast.Prof)
minProfs <- numAstProfInfo[1]
  
# Figure out which row(s) in the dataset
# carry this minimum number - there 
# could potentially be multiple rows..
index <- which(fac.sal.new$Num_Ast.Prof == minProfs, 
               arr.ind=TRUE)

minColleges <- as.character(fac.sal.new[index, 2])
cat("\nColleges with the minimum number of Asst Profs:\n\n")
## 
## Colleges with the minimum number of Asst Profs:
print(minColleges)
## [1] "Covenant College"             "Judson College"              
## [3] "Grace College"                "Center for Creative Studies" 
## [5] "Cooper Union for Adv.Sci&Art"
# Checking whether total number of faculty 
# positions in Purdue Univ.-Main Campus 
# is represented by the column of Faculty-All ranks 
PUIndex <- which(fac.sal.new[ , 2] =="Purdue Univ.-Main Campus",
                 arr.ind=TRUE)

x <- fac.sal.new$Num_All[PUIndex]
y <- fac.sal.new$Num_Prof[PUIndex] +
  fac.sal.new$Num_Asc.Prof[PUIndex] +
  fac.sal.new$Num_Ast.Prof[PUIndex]+
  fac.sal.new$Num_Instructors[PUIndex]

# No.of colleges where AveSal of a Professor 
# rounds up to 80% of the AveComp
fac.sal.new$Percent <- fac.sal.new$AveSal_Prof/fac.sal.new$AveComp_Prof*100
percentComp <- round(fac.sal.new$Percent)

# Which colleges 
index <- which(percentComp == 80, arr.ind=TRUE)
collegesWith80 <- as.character(fac.sal.new[index, 2])
cat("\nColleges with Salary as 80% of Compensation:\n\n")
## 
## Colleges with Salary as 80% of Compensation:
print(collegesWith80)
##   [1] "Alaska Pacific University"    "Oakwood College"             
##   [3] "Samford University"           "Univ. Arkansas at Pine Bluff"
##   [5] "Arkansas Tech University"     "Arkansas State Univ.-Main"   
##   [7] "Henderson State University"   "Ouachita Baptist University" 
##   [9] "Southern Ark. Univ.-Main"     "California Lutheran Univ."   
##  [11] "Cal.St.Univ-Fullerton"        "Cal.St.Univ-Hayward"         
##  [13] "Cal.St.Univ-Long Beach"       "Cal.St.Univ-Dominguez Hills" 
##  [15] "Cal.Poly.St.U-Sn Luis Obispo" "Cal.Poly.St.Univ-Pomona"     
##  [17] "Cal.St.Univ-Chico"            "Cal.St.Univ-Fresno"          
##  [19] "San Diego State University"   "San Jose State University"   
##  [21] "Sonoma State University"      "Claremont McKenna College"   
##  [23] "Harvey Mudd College"          "Pomona College"              
##  [25] "University of La Verne"       "Mills College"               
##  [27] "Point Loma Nazarene College"  "San Francisco State Univ."   
##  [29] "University of San Diego"      "University of New Haven"     
##  [31] "Sacred Heart University"      "University of Hartford"      
##  [33] "Florida Southern College"     "Rollins College"             
##  [35] "University Central Florida"   "Univ. of Hawaii at Manoa"    
##  [37] "Buena Vista College"          "Grand View College"          
##  [39] "St. Ambrose University"       "University of Northern Iowa" 
##  [41] "University of Idaho"          "Greenville College"          
##  [43] "McKendree College"            "North Central College"       
##  [45] "Illinois Benedictine College" "Wheaton College"             
##  [47] "Butler University"            "Earlham College"             
##  [49] "Franklin College of Indiana"  "Indiana Wesleyan University" 
##  [51] "Saint Mary-of-the-Woods Coll" "Bethel College"              
##  [53] "Fort Hays State University"   "Kansas State University"     
##  [55] "McPherson College"            "Centre College"              
##  [57] "Eastern Kentucky University"  "Kentucky State University"   
##  [59] "Morehead State University"    "Centenary College of La."    
##  [61] "Bentley College"              "Bradford College"            
##  [63] "Brandeis University"          "Massachusetts Inst. of Tech."
##  [65] "Fitchburg State College"      "Mount Holyoke College"       
##  [67] "Regis College"                "Univ. Massachussets-Amherst" 
##  [69] "Western New England College"  "Goucher College"             
##  [71] "Morgan State University"      "Salisbury State University"  
##  [73] "Bowdoin College"              "Univ.of Maine at Orono"      
##  [75] "Univ.of Maine at Machias"     "Concordia College"           
##  [77] "Univ.of Michigan-Flint"       "Western Michigan University" 
##  [79] "University of St. Thomas"     "William Jewell College"      
##  [81] "Belmont Abbey College"        "Elon College"                
##  [83] "Guilford College"             "High Point University"       
##  [85] "North Carolina Wesleyan Coll" "Univ. of North Dakota-Main"  
##  [87] "Valley City State University" "Plymouth State College"      
##  [89] "Saint Peter's College"        "Stevens Institute of Tech."  
##  [91] "New Mexico Highlands Univ."   "Canisius College"            
##  [93] "Clarkson University"          "Hartwick College"            
##  [95] "Ithaca College"               "Pace University"             
##  [97] "Rochester Institute of Tech." "Russell Sage College"        
##  [99] "Sarah Lawrence College"       "Baldwin-Wallace College"     
## [101] "Capital University"           "Case Western Reserve Univ."  
## [103] "Cedarville College"           "Malone College"              
## [105] "Ohio Northern University"     "Ohio University-Athens"      
## [107] "Univ. of Akron- Main Campus"  "Xavier University"           
## [109] "Cameron University"           "University of Oklahoma"      
## [111] "Lewis & Clark College"        "Cedar Crest College"         
## [113] "Drexel University"            "Duquesne University"         
## [115] "Holy Family College"          "La Salle University"         
## [117] "Lebanon Valley College"       "Carlow College"              
## [119] "Villanova University"         "Washington & Jefferson Coll."
## [121] "Wilson College"               "Salve Regina University"     
## [123] "Central Wesleyan College"     "Converse College"            
## [125] "Winthrop University"          "Tennessee State University"  
## [127] "Union University"             "Univ. of Tennessee- Memphis" 
## [129] "Houston Baptist University"   "Midwestern State University" 
## [131] "Univ. of Texas-Pan Amer.Univ" "Southern Methodist Univ."    
## [133] "Southwest Texas State Univ."  "Texas Tech University"       
## [135] "Texas Wesleyan University"    "Texas Woman's University"    
## [137] "Univ. of Texas at Arlington"  "West Texas A&M University"   
## [139] "Univ. of Texas -San Antonio"  "Texas A&M Univ.at Galveston" 
## [141] "Univ. of Houston-Clear Lake"  "Univ. of Houston-Downtown"   
## [143] "Bridgewater College"          "Longwood College"            
## [145] "Saint Paul's College"         "Virginia Military Institute" 
## [147] "Norfolk State University"     "Green Mountain College"      
## [149] "Eastern Washington Univ."     "Pacific Lutheran University" 
## [151] "Marian Coll.of Fond du Lac"   "Marquette University"        
## [153] "Mount Mary College"           "Univ. Wisconsin-Madison"     
## [155] "University of Charleston"