# Load the records from 'dummy-dataset.csv' (path is relative to the current
# working directory) into a data frame.
readdata <- read.csv(file = "dummy-dataset.csv")
# Print per-column summary statistics for the loaded data frame.
summary(object = readdata)
## Employee_ID Band TRE Salary
## Length:10000 Length:10000 Min. : 0.3534 Min. : 411329
## Class :character Class :character 1st Qu.: 2.1641 1st Qu.: 507629
## Mode :character Mode :character Median : 5.5256 Median : 782675
## Mean : 6.0067 Mean : 758419
## 3rd Qu.: 8.6298 3rd Qu.: 971380
## Max. :23.2457 Max. :1322810
# Plot one histogram per numeric column (Salary first, then TRE).
# Titles and axis labels are set explicitly; see ?hist for other options.
hist(
  x = readdata$Salary,
  main = "Histogram of Salary",
  xlab = "Salary",
  ylab = "Number of employees",
  col = "lightblue",
  border = NULL
)

hist(
  x = readdata$TRE,
  main = "Histogram of TRE",
  xlab = "TRE",
  ylab = "Number of employees",
  col = "lightgreen",
  border = NULL
)

# Single boxplot of Salary across all rows; see ?boxplot for other options.
boxplot(
  x = readdata$Salary,
  main = "Boxplot of Salary",
  ylab = "Salary",
  col = "orange",
  border = "black"
)

# Salary split by Band: one box per band, all drawn in the same plot.
boxplot(
  Salary ~ Band,
  data = readdata,
  main = "Boxplot of Salary band-wise",
  xlab = "Band",
  ylab = "Salary",
  col = "red"
)

# One-sample, two-sided t-test of the hypothesis that the true mean Salary of
# band B1 equals 5L (mu = 500000), with a 95% confidence interval.
# See ?t.test for the full argument list.
readdataB1 <- with(readdata, Salary[Band == "B1"])
ttest_res <- t.test(
  x = readdataB1,
  mu = 500000,
  alternative = "two.sided",
  conf.level = 0.95
)
# Display the test statistic, p-value, confidence interval and sample mean.
print(ttest_res)
##
## One Sample t-test
##
## data: readdataB1
## t = -0.65789, df = 4013, p-value = 0.5106
## alternative hypothesis: true mean is not equal to 5e+05
## 95 percent confidence interval:
## 498965.7 500514.5
## sample estimates:
## mean of x
## 499740.1
# Compute SalPerTRE, the row-wise ratio of Salary to TRE, and assemble a data
# frame `df` holding the four original columns plus the computed one.
SalPerTRE <- readdata$Salary / readdata$TRE
# Columns supplied as `readdata$...` expressions are auto-named
# `readdata.<column>`; SalPerTRE keeps its own name.
df <- data.frame(
  readdata$Employee_ID,
  readdata$Band,
  readdata$TRE,
  readdata$Salary,
  SalPerTRE
)
# Show the first 10 rows of the result.
head(df, n = 10)
## readdata.Employee_ID readdata.Band readdata.TRE readdata.Salary SalPerTRE
## 1 E00002 B1 1.975521 487820 246932.3
## 2 E00003 B1 2.559554 497495 194367.9
## 3 E00004 B1 2.530525 520264 205595.3
## 4 E00006 B1 1.628420 514960 316232.9
## 5 E00011 B1 2.739593 491793 179513.1
## 6 E00013 B1 2.403660 461647 192060.0
## 7 E00017 B1 1.643284 502342 305694.0
## 8 E00018 B1 1.755367 515990 293949.9
## 9 E00025 B1 2.512871 512754 204051.0
## 10 E00026 B1 1.242617 504069 405651.1
# Draw boxplots of SalPerTRE band-wise and observe if there are any outliers
# (points drawn beyond the whiskers). The title and y-axis label name
# SalPerTRE, the quantity being plotted. Band is taken from `readdata`, whose
# rows line up one-to-one with `df`.
boxplot(
  df$SalPerTRE ~ readdata$Band,
  col = "red",
  main = "Boxplot of SalPerTRE band-wise",
  xlab = "Band",
  ylab = "SalPerTRE"
)
