# Load the records stored in 'dummy-dataset.csv' into a data frame.
readdata <- read.csv(file = "dummy-dataset.csv")
# Print per-column summary statistics for the loaded data frame.
summary(object = readdata)
##  Employee_ID            Band                TRE              Salary       
##  Length:10000       Length:10000       Min.   : 0.3534   Min.   : 411329  
##  Class :character   Class :character   1st Qu.: 2.1641   1st Qu.: 507629  
##  Mode  :character   Mode  :character   Median : 5.5256   Median : 782675  
##                                        Mean   : 6.0067   Mean   : 758419  
##                                        3rd Qu.: 8.6298   3rd Qu.: 971380  
##                                        Max.   :23.2457   Max.   :1322810
# Plot the distribution of each numeric column (see ?hist for the options).
hist(
  readdata[["Salary"]],
  main = "Histogram of Salary",
  xlab = "Salary",
  ylab = "Number of employees",
  col = "lightblue",
  border = NULL
)

hist(
  readdata[["TRE"]],
  main = "Histogram of TRE",
  xlab = "TRE",
  ylab = "Number of employees",
  col = "lightgreen",
  border = NULL
)

# Boxplot of Salary over all rows (see ?boxplot for the options).
boxplot(
  readdata[["Salary"]],
  main = "Boxplot of Salary",
  ylab = "Salary",
  col = "orange",
  border = "black"
)

# One Salary box per Band value, drawn side by side in a single plot.
boxplot(
  Salary ~ Band,
  data = readdata,
  main = "Boxplot of Salary band-wise",
  xlab = "Band",
  ylab = "Salary",
  col = "red"
)

# One-sample, two-sided t-test of whether the true mean salary of band B1
# equals 5L (500000), at the 95% confidence level. See ?t.test for options.
readdataB1 <- readdata[readdata$Band == "B1", "Salary"]
ttest_res <- t.test(
  readdataB1,
  alternative = "two.sided",
  mu = 500000,
  paired = FALSE,
  conf.level = 0.95
)
ttest_res
## 
##  One Sample t-test
## 
## data:  readdataB1
## t = -0.65789, df = 4013, p-value = 0.5106
## alternative hypothesis: true mean is not equal to 5e+05
## 95 percent confidence interval:
##  498965.7 500514.5
## sample estimates:
## mean of x 
##  499740.1
# Compute SalPerTRE, the ratio of Salary to TRE, and assemble a data frame
# holding the four original columns followed by the new ratio column.
SalPerTRE <- readdata[["Salary"]] / readdata[["TRE"]]
df <- data.frame(
  readdata.Employee_ID = readdata[["Employee_ID"]],
  readdata.Band = readdata[["Band"]],
  readdata.TRE = readdata[["TRE"]],
  readdata.Salary = readdata[["Salary"]],
  SalPerTRE = SalPerTRE
)
# Show the first ten rows.
head(df, n = 10)
##    readdata.Employee_ID readdata.Band readdata.TRE readdata.Salary SalPerTRE
## 1                E00002            B1     1.975521          487820  246932.3
## 2                E00003            B1     2.559554          497495  194367.9
## 3                E00004            B1     2.530525          520264  205595.3
## 4                E00006            B1     1.628420          514960  316232.9
## 5                E00011            B1     2.739593          491793  179513.1
## 6                E00013            B1     2.403660          461647  192060.0
## 7                E00017            B1     1.643284          502342  305694.0
## 8                E00018            B1     1.755367          515990  293949.9
## 9                E00025            B1     2.512871          512754  204051.0
## 10               E00026            B1     1.242617          504069  405651.1
# Draw one SalPerTRE box per band in a single plot. With boxplot()'s default
# settings, observations lying beyond the whiskers are drawn as individual
# points, which makes potential outliers visible.
# Both variables are taken from `df` (whose band column is `readdata.Band`),
# and the title and y-axis label name the quantity actually plotted.
boxplot(
  SalPerTRE ~ readdata.Band,
  data = df,
  col = "red",
  main = "Boxplot of SalPerTRE band-wise",
  xlab = "Band",
  ylab = "SalPerTRE"
)