Loading required packages
First, import the faculty salary data from Blackboard.
# Import the faculty salary data. The path is relative, so FS_1.csv must be
# in the working directory. Earlier absolute-path version, kept for reference:
#S <- read.csv("P:/SUNY-Brockport/SUNY Brockport_1/teaching/Staitstical Methods/Fall2019/Project Faculty Salary/FS_1.csv")
library(readr)
FS <- read_csv("FS_1.csv")
# View() opens an interactive data viewer, so it is only called in an
# interactive session and is skipped when the script is knitted or run in batch.
if (interactive()) {
  View(FS)
}
Exploring Dataset attributes
# Preview the first six rows of the data
head(FS, n = 6L)
## # A tibble: 6 x 10
## Gender Title Title2 `PhD graduated … `PhD School` Salary Dept Dept2
## <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr>
## 1 Male Asso… Assoc… 1999 SUNY, Stony… 96238 Stat… Stat…
## 2 Male Dist… Full 1967 Stanford Un… 140649 Stat… Stat…
## 3 Male Full… Full 1972 University … 68316 Stat… Stat…
## 4 Male Full… Full 1983 University … 160177 Stat… Stat…
## 5 Male Full… Full 1986 The Weizman… 175599 Stat… Stat…
## 6 Female Asso… Assoc… 2003 University … 91000 Stat… Stat…
## # … with 2 more variables: `PhDyear category` <chr>, yearcat <chr>
# Per-variable summary of the data
summary(object = FS)
## Gender Title Title2
## Length:72 Length:72 Length:72
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## PhD graduated year PhD School Salary Dept
## Min. :1967 Length:72 Min. : 10115 Length:72
## 1st Qu.:1979 Class :character 1st Qu.: 68176 Class :character
## Median :1989 Mode :character Median : 87619 Mode :character
## Mean :1989 Mean : 97339
## 3rd Qu.:1999 3rd Qu.:118125
## Max. :2008 Max. :196955
## NA's :17
## Dept2 PhDyear category yearcat
## Length:72 Length:72 Length:72
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
# names() lists the variable names of the data
names(x = FS)
## [1] "Gender" "Title" "Title2"
## [4] "PhD graduated year" "PhD School" "Salary"
## [7] "Dept" "Dept2" "PhDyear category"
## [10] "yearcat"
# Row names of the data
rownames(x = FS)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
## [15] "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28"
## [29] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42"
## [43] "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56"
## [57] "57" "58" "59" "60" "61" "62" "63" "64" "65" "66" "67" "68" "69" "70"
## [71] "71" "72"
# Column names of the data
colnames(x = FS)
## [1] "Gender" "Title" "Title2"
## [4] "PhD graduated year" "PhD School" "Salary"
## [7] "Dept" "Dept2" "PhDyear category"
## [10] "yearcat"
# attributes() returns every attribute stored on the data object
attributes(x = FS)
## $names
## [1] "Gender" "Title" "Title2"
## [4] "PhD graduated year" "PhD School" "Salary"
## [7] "Dept" "Dept2" "PhDyear category"
## [10] "yearcat"
##
## $class
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
##
## $row.names
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [47] 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
## [70] 70 71 72
##
## $spec
## cols(
## Gender = col_character(),
## Title = col_character(),
## Title2 = col_character(),
## `PhD graduated year` = col_double(),
## `PhD School` = col_character(),
## Salary = col_double(),
## Dept = col_character(),
## Dept2 = col_character(),
## `PhDyear category` = col_character(),
## yearcat = col_character()
## )
Below we supply the first (row) index and leave the second (column) index blank. This indicates that we want all the variables for the specified observations.
# Leaving the column index empty keeps every variable for the chosen rows.
FS_sub <- FS[c(1L, 7L, 8L), ]
# When the rows we want are consecutive, a sequence can replace the
# explicit c() listing; seq_len(4L) is the same index vector as 1:4.
FS_sub2 <- FS[seq_len(4L), ]
Merging two factor levels: this merges the levels and creates new titles in the last dataset.
To attach the new column to our original data, we need to call it before running the code.
P.S. I am not running this code, as the data set already has a Title2 column, but you can use it as a guide for creating a new category or merging categories of an existing variable.
Subsetting based on variable values and/or observations
#################
# Keep only the male faculty. %in% treats a missing Gender as "no match",
# so rows with a missing Gender are excluded.
FS_sub3 <- FS[FS$Gender %in% "Male", ]
head(FS_sub3, n = 6L)
## # A tibble: 6 x 10
## Gender Title Title2 `PhD graduated … `PhD School` Salary Dept Dept2
## <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr>
## 1 Male Asso… Assoc… 1999 SUNY, Stony… 96238 Stat… Stat…
## 2 Male Dist… Full 1967 Stanford Un… 140649 Stat… Stat…
## 3 Male Full… Full 1972 University … 68316 Stat… Stat…
## 4 Male Full… Full 1983 University … 160177 Stat… Stat…
## 5 Male Full… Full 1986 The Weizman… 175599 Stat… Stat…
## 6 Male Full… Full 1993 Australian … 94611 Stat… Stat…
## # … with 2 more variables: `PhDyear category` <chr>, yearcat <chr>
# FS4 holds the male faculty whose salary exceeds 60,000, restricted to
# four columns: Gender, Salary, Dept and Title2.
FS4 <- subset(
  FS,
  subset = Salary > 60000 & Gender == "Male",
  select = c(Gender, Salary, Dept, Title2)
)
head(FS4, n = 6L)
## # A tibble: 6 x 4
## Gender Salary Dept Title2
## <chr> <dbl> <chr> <chr>
## 1 Male 96238 Statistics Associate
## 2 Male 140649 Statistics Full
## 3 Male 68316 Statistics Full
## 4 Male 160177 Statistics Full
## 5 Male 175599 Statistics Full
## 6 Male 94611 Statistics Full
Box Plot
- There are many ways to construct a box plot, such as boxplot() or bwplot().
- bwplot() requires the lattice package; it provides better labels and titles.
- The first plot below puts salary on the y-axis and title on the x-axis, and divides the data by Dept.
#par(mfrow=c(3,2))
# Uncomment the line above to place several base-graphics plots on one
# page (3 rows, 2 columns).
# NOTE: par(mfrow) only lays out base graphics; it does not arrange lattice
# plots such as the bwplot() below.
par(mfrow = c(1, 1))
# Salary (y-axis) by title (x-axis), one panel per department.
# bwplot() is called through the lattice namespace so the plot does not
# depend on lattice having been attached earlier in the session.
lattice::bwplot(Salary ~ Title2 | Dept, data = FS4)

# Salary (y-axis) by department (x-axis), one panel per gender.
# The columns are found through `data = FS`, so FS is not attached to the
# search path; attaching is unnecessary here and masks objects when repeated.
# NOTE: the dot inside each lattice box marks the median, not the mean.
lattice::bwplot(Salary ~ Dept | Gender, data = FS)

#bwplot(Salary~Dept2|Gender,data=FS,horizontal = F, las = 2)
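There are many ways to construct a box plot, such as boxplot() or bwplot(). bwplot() comes from the lattice package (which is also attached when the mosaic package is loaded) and provides better labels and titles. The first plot below puts salary on the y-axis and title on the x-axis and divides the data by Dept; the second puts salary on the y-axis and department on the x-axis and divides the data by gender. Note that the dot inside each bwplot() box marks the median; group means are added to a base boxplot() in the next section.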
# Salary (y-axis) by title (x-axis), one panel per department.
# Called through the lattice namespace so it works even if lattice has not
# been attached in this session.
lattice::bwplot(Salary ~ Title2 | Dept, data = FS4)

# Salary (y-axis) by Dept2 (x-axis), one panel per gender.
# FS is not attached here: the columns are found through `data = FS`, and a
# repeated attach(FS) only masks the copy that is already on the search path.
# NOTE: the dot inside each lattice box marks the median, not the mean.
lattice::bwplot(Salary ~ Dept2 | Gender, data = FS)

Adding mean to box plot
#par(mfrow=c(3,2))
# Uncomment the line above to put several plots on one page
# (3 rows, 2 columns).
par(mfrow = c(1, 1))
# Base-graphics alternative: draw the box plot first, then overlay the group
# means, which tapply() computes in a single call.
boxplot(Salary ~ Gender,
  data = FS,
  main = "Male vs. Female Salaries",
  ylab = "Salaries",
  xlab = "Gender",
  horizontal = FALSE, las = 3
)
# na.rm = TRUE keeps a single missing salary from turning a whole group
# mean into NA (which would silently drop that group's point).
means <- tapply(FS$Salary, FS$Gender, mean, na.rm = TRUE)
# tapply() and boxplot() both order the groups by the sorted Gender levels,
# so the i-th mean is drawn on the i-th box.
points(means, col = "red", pch = 18)

Miscellaneous data-cleaning examples: removing the rows for one title (say, AssocProfCourtesy), and renaming or fixing a label (e.g., "Lecturer" is written as "Lecture" on line 71). This is not strictly needed here, but is shown for reference.
# Drop every row whose title is "AssocProfCourtesy"; rows with a missing
# title are excluded as well.
FS1 <- FS[!is.na(FS$Title) & FS$Title != "AssocProfCourtesy", ]
# Fix a mislabelled title: "Lecture" should read "Lecturer" (the original
# note says this occurs on line 71; not strictly needed, shown for reference).
FS1$Title[FS1$Title %in% "Lecture"] <- "Lecturer"
# Keep only four columns, which removes e.g. the PhD year.
FS2 <- FS1[, c("Gender", "Salary", "Dept", "Title")]
# Pull out the rows for a single title, here the associate professors.
FS3 <- subset(
  FS2,
  subset = Title == "Associate Professor",
  select = c(Gender, Salary, Dept, Title)
)