FacultySalary

Loading required packages

First import faculty salary data from blackboard

# Import the faculty salary data. "FS_1.csv" is read relative to the current
# working directory; the commented-out line below is the earlier import that
# used an absolute path.
#S <- read.csv("P:/SUNY-Brockport/SUNY Brockport_1/teaching/Staitstical Methods/Fall2019/Project Faculty Salary/FS_1.csv")
library(readr)
FS <- read_csv("FS_1.csv")
# View() opens a spreadsheet-style viewer, so only call it when a person is
# at the console; this keeps non-interactive runs (e.g. Rscript) working.
if (interactive()) {
  View(FS)
}

Exploring Dataset attributes

# Preview the first six rows of the data (six is head()'s default).
head(x = FS, n = 6L)

## # A tibble: 6 x 10
##   Gender Title Title2 `PhD graduated … `PhD School` Salary Dept  Dept2
##   <chr>  <chr> <chr>             <dbl> <chr>         <dbl> <chr> <chr>
## 1 Male   Asso… Assoc…             1999 SUNY, Stony…  96238 Stat… Stat…
## 2 Male   Dist… Full               1967 Stanford Un… 140649 Stat… Stat…
## 3 Male   Full… Full               1972 University …  68316 Stat… Stat…
## 4 Male   Full… Full               1983 University … 160177 Stat… Stat…
## 5 Male   Full… Full               1986 The Weizman… 175599 Stat… Stat…
## 6 Female Asso… Assoc…             2003 University …  91000 Stat… Stat…
## # … with 2 more variables: `PhDyear category` <chr>, yearcat <chr>

# Per-column summary of every variable in the data.
summary(object = FS)

##     Gender             Title              Title2         
##  Length:72          Length:72          Length:72         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  PhD graduated year  PhD School            Salary           Dept          
##  Min.   :1967       Length:72          Min.   : 10115   Length:72         
##  1st Qu.:1979       Class :character   1st Qu.: 68176   Class :character  
##  Median :1989       Mode  :character   Median : 87619   Mode  :character  
##  Mean   :1989                          Mean   : 97339                     
##  3rd Qu.:1999                          3rd Qu.:118125                     
##  Max.   :2008                          Max.   :196955                     
##  NA's   :17                                                               
##     Dept2           PhDyear category     yearcat         
##  Length:72          Length:72          Length:72         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##

# List the variable (column) names with names().
names(x = FS)

##  [1] "Gender"             "Title"              "Title2"            
##  [4] "PhD graduated year" "PhD School"         "Salary"            
##  [7] "Dept"               "Dept2"              "PhDyear category"  
## [10] "yearcat"

# List the row names.
rownames(x = FS)

##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
## [15] "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28"
## [29] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42"
## [43] "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56"
## [57] "57" "58" "59" "60" "61" "62" "63" "64" "65" "66" "67" "68" "69" "70"
## [71] "71" "72"

# List the column names.
colnames(x = FS)

##  [1] "Gender"             "Title"              "Title2"            
##  [4] "PhD graduated year" "PhD School"         "Salary"            
##  [7] "Dept"               "Dept2"              "PhDyear category"  
## [10] "yearcat"

# Show every attribute stored on the data object with attributes().
attributes(x = FS)

## $names
##  [1] "Gender"             "Title"              "Title2"            
##  [4] "PhD graduated year" "PhD School"         "Salary"            
##  [7] "Dept"               "Dept2"              "PhDyear category"  
## [10] "yearcat"           
## 
## $class
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame" 
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [47] 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
## [70] 70 71 72
## 
## $spec
## cols(
##   Gender = col_character(),
##   Title = col_character(),
##   Title2 = col_character(),
##   `PhD graduated year` = col_double(),
##   `PhD School` = col_character(),
##   Salary = col_double(),
##   Dept = col_character(),
##   Dept2 = col_character(),
##   `PhDyear category` = col_character(),
##   yearcat = col_character()
## )

Below we supply the first (row) index and leave the second (column) index blank. This indicates that we want all the variables for the specified observations.

# Pick individual observations (rows 1, 7 and 8). The column index is left
# empty, so every variable is kept.
FS_sub <- FS[c(1, 7, 8), ]

# When the rows wanted are consecutive, a sequence can be used instead of
# listing each index with c(). Here: rows 1 through 4.
FS_sub2 <- FS[seq_len(4), ]

Merging two factor levels: this merges the levels and creates new titles for the last dataset.

To attach the new column to our original data, we need to call it before running the code.

PS: I am not running this code because the data set already has a Title2 column, but you can use it as a guide to create a new category or to merge categories using an existing variable.

Subsetting based on variable values and/or observations

# ---- Keep only the rows whose Gender is "Male" ----
FS_sub3 <- FS[which(FS[["Gender"]] == "Male"), ]
head(x = FS_sub3, n = 6L)

## # A tibble: 6 x 10
##   Gender Title Title2 `PhD graduated … `PhD School` Salary Dept  Dept2
##   <chr>  <chr> <chr>             <dbl> <chr>         <dbl> <chr> <chr>
## 1 Male   Asso… Assoc…             1999 SUNY, Stony…  96238 Stat… Stat…
## 2 Male   Dist… Full               1967 Stanford Un… 140649 Stat… Stat…
## 3 Male   Full… Full               1972 University …  68316 Stat… Stat…
## 4 Male   Full… Full               1983 University … 160177 Stat… Stat…
## 5 Male   Full… Full               1986 The Weizman… 175599 Stat… Stat…
## 6 Male   Full… Full               1993 Australian …  94611 Stat… Stat…
## # … with 2 more variables: `PhDyear category` <chr>, yearcat <chr>

# Build FS4: rows where Gender is "Male" and Salary is above 60000,
# keeping only the Gender, Salary, Dept and Title2 columns.
FS4 <- subset(
  x = FS,
  subset = Gender == "Male" & Salary > 60000,
  select = c(Gender, Salary, Dept, Title2)
)
head(x = FS4, n = 6L)

## # A tibble: 6 x 4
##   Gender Salary Dept       Title2   
##   <chr>   <dbl> <chr>      <chr>    
## 1 Male    96238 Statistics Associate
## 2 Male   140649 Statistics Full     
## 3 Male    68316 Statistics Full     
## 4 Male   160177 Statistics Full     
## 5 Male   175599 Statistics Full     
## 6 Male    94611 Statistics Full

Box Plot

There are many ways to construct a box plot, such as boxplot(), bwplot(), etc.
bwplot() requires the lattice package; it provides better labels and titles.
The first plot below puts salary on the y-axis and title on the x-axis, and divides the data by Dept.

# bwplot() is provided by the lattice package, so load it before plotting.
library(lattice)

#par(mfrow=c(3,2))
# use the above option if you want plots together on one page
#3 rows 2 columns
par(mfrow = c(1, 1))

# Salary (y-axis) by Title2 (x-axis), one panel per Dept, using the FS4 subset.
bwplot(Salary ~ Title2 | Dept, data = FS4)

# Salary (y-axis) by Dept (x-axis), one panel per Gender, using the full data.
# attach(FS) is not needed here: the formula's variables are looked up in the
# data frame passed through `data = FS`.
# The dot drawn inside each lattice box marks the median.
bwplot(Salary ~ Dept | Gender, data = FS)

#bwplot(Salary~Dept2|Gender,data=FS,horizontal = F, las = 2)

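There are many ways to construct a box plot, such as boxplot(), bwplot(), etc. bwplot() is available once the mosaic package is loaded (the function itself is defined in lattice, which mosaic loads); it provides better labels and titles. The first plot below puts salary on the y-axis and title on the x-axis and divides the data by Dept; the second puts salary on the y-axis and department on the x-axis and divides the data by gender. Note that the dot inside each lattice box marks the median by default; group means are added separately in the "Adding mean to box plot" section.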

# bwplot() comes from the lattice package; load it so this chunk runs on its
# own.
library(lattice)

# Salary (y-axis) by Title2 (x-axis), one panel per Dept, using the FS4 subset.
bwplot(Salary ~ Title2 | Dept, data = FS4)

# Salary (y-axis) by Dept2 (x-axis), one panel per Gender, using the full data.
# The earlier attach(FS) call was dropped: `data = FS` already tells bwplot()
# where to find the variables, and attaching FS a second time only produced
# "objects are masked" messages.
bwplot(Salary ~ Dept2 | Gender, data = FS)

Adding mean to box plot

#par(mfrow=c(3,2))
# use the above option if you want plots together on one page
#3 rows 2 columns
par(mfrow = c(1, 1))

# Base-graphics alternative to bwplot(): draw the box plot first, then
# overlay the group means by hand. tapply() is a handy way to compute one
# statistic per group.
boxplot(FS$Salary ~ FS$Gender,
        main = "Male vs. Female Salaries",
        ylab = "Salaries",
        xlab = "Gender",
        horizontal = FALSE, las = 3)

# Mean salary per gender. na.rm = TRUE keeps a missing salary from turning a
# whole group's mean into NA (boxplot() itself ignores missing values).
means <- tapply(FS$Salary, FS$Gender, mean, na.rm = TRUE)

# With no explicit x, points() places the means at x = 1, 2, ..., which lines
# up with the boxes because both calls order the groups the same way.
points(means, col = "red", pch = 18)

Miscellaneous stuff for data cleaning: an example of removing the data for a title, let's say AssocProfCourtesy, and of renaming or fixing a label, e.g. "Lecturer" is written as "Lecture" on line 71. Not that it's needed, but FYI.

# Drop every row whose Title is "AssocProfCourtesy".
FS1 <- FS[which(FS[["Title"]] != "AssocProfCourtesy"), ]

# Fix a mislabelled title: "Lecture" should read "Lecturer" (the original
# note says this occurs on line 71). Not strictly needed, but shown FYI.
FS1$Title <- replace(FS1$Title, FS1$Title == "Lecture", "Lecturer")

# Keep only the columns of interest; this drops e.g. the PhD year columns.
FS2 <- subset(x = FS1, select = c(Gender, Salary, Dept, Title))

# Pull out the rows for a single title, here "Associate Professor".
FS3 <- subset(
  x = FS2,
  subset = Title == "Associate Professor",
  select = c(Gender, Salary, Dept, Title)
)

FacultySalary

Tasneem Zaihra

October 3, 2019

Loading required packages

First import faculty salary data from blackboard

Exploring Dataset attributes

Below we supply the first (row) index and leave the second (column) index blank. This indicates that we want all the variables for the specified observations.

Merging two factor levels: this merges the levels and creates new titles for the last dataset.

To attach the new column to our original data, we need to call it before running the code.

PS: I am not running this code because the data set already has a Title2 column, but you can use it as a guide to create a new category or to merge categories using an existing variable.

Subsetting based on variable values and/or observations

Box Plot

Adding mean to box plot

Miscellaneous stuff for data cleaning: an example of removing the data for a title, let's say AssocProfCourtesy, and of renaming or fixing a label, e.g. "Lecturer" is written as "Lecture" on line 71. Not that it's needed, but FYI.