##Oral and Hands-on Exam on Page 111 and 112

#Problem 6

# Noise-level readings for the six areas, one numeric vector per area

area1 <- c(30, 12, 35, 65, 24, 59, 68, 57, 100, 61, 32, 45, 92, 56, 44)
area2 <- c(64, 99, 87, 59, 23, 16, 94, 78, 57, 32, 52, 78, 59, 55, 55)
area3 <- c(100, 59, 78, 97, 84, 64, 53, 59, 89, 88, 94, 66, 57, 62, 64)
area4 <- c(25, 15, 30, 20, 61, 56, 34, 22, 24, 21, 32, 52, 14, 10, 33)
area5 <- c(59, 63, 81, 110, 65, 112, 132, 145, 163, 120, 84, 99, 105, 68, 75)
area6 <- c(67, 80, 99, 49, 67, 56, 80, 125, 100, 93, 56, 45, 80, 34, 21)

# Draw one horizontal boxplot per area, in the order area1 ... area6.
# invisible() keeps the list returned by lapply() from being printed.

invisible(lapply(
  list(area1, area2, area3, area4, area5, area6),
  boxplot,
  horizontal = TRUE
))

# Confirm that every area holds the same number of readings
# (data.frame() below needs columns of equal length)

length(area1)
## [1] 15
length(area2)
## [1] 15
length(area3)
## [1] 15
length(area4)
## [1] 15
length(area5)
## [1] 15
length(area6)
## [1] 15

# Combine the six vectors into a single data frame (one column per area)
# and display it

areadf <- data.frame(area1, area2, area3, area4, area5, area6)
areadf

# Draw all six areas in one plot so they can be compared directly

boxplot(
  areadf,
  col = "blue",
  horizontal = TRUE,
  xlab = "All Area Noise Level"
)

# Minimum, quartiles, median, mean and maximum of every area
summary(areadf)
##      area1           area2           area3            area4      
##  Min.   : 12.0   Min.   :16.00   Min.   : 53.00   Min.   :10.00  
##  1st Qu.: 33.5   1st Qu.:53.50   1st Qu.: 60.50   1st Qu.:20.50  
##  Median : 56.0   Median :59.00   Median : 66.00   Median :25.00  
##  Mean   : 52.0   Mean   :60.53   Mean   : 74.27   Mean   :29.93  
##  3rd Qu.: 63.0   3rd Qu.:78.00   3rd Qu.: 88.50   3rd Qu.:33.50  
##  Max.   :100.0   Max.   :99.00   Max.   :100.00   Max.   :61.00  
##      area5            area6       
##  Min.   : 59.00   Min.   : 21.00  
##  1st Qu.: 71.50   1st Qu.: 52.50  
##  Median : 99.00   Median : 67.00  
##  Mean   : 98.73   Mean   : 70.13  
##  3rd Qu.:116.00   3rd Qu.: 86.50  
##  Max.   :163.00   Max.   :125.00
#From this boxplot, we see that 3 of the 15 readings (20%) in area 5 are above the safe hearing
#level of 120 decibels. Those workers in area 5 should definitely have protective earwear.
# One of the readings in area 6 is above the safe hearing level.
# It might be a good idea to provide protective earwear to the workers in area 6 as well.
# Areas 1-4 appear to be "safe" with respect to hearing level, with area 4 being the safest.


#---------------Problem 7------------------#

# Attach readxl, which supplies read_excel()
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.2

# Read the pizza delivery workbook from the local Downloads folder,
# display the resulting table, then inspect the type of each column
pizza <- read_excel(path = "C:/Users/user/Downloads/pizza_delivery.xls")
pizza
str(object = pizza)
## tibble [1,266 x 12] (S3: tbl_df/tbl/data.frame)
##  $ day              : chr [1:1266] "Thursday" "Thursday" "Thursday" "Thursday" ...
##  $ date             : POSIXct[1:1266], format: "2014-05-01" "2014-05-01" ...
##  $ time             : num [1:1266] 35.1 25.2 45.6 29.4 30 ...
##  $ operator         : chr [1:1266] "Laura" "Melissa" "Melissa" "Melissa" ...
##  $ branch           : chr [1:1266] "East" "East" "West" "East" ...
##  $ driver           : chr [1:1266] "Bruno" "Salvatore" "Salvatore" "Salvatore" ...
##  $ temperature      : num [1:1266] 68.3 71 53.4 70.3 71.5 ...
##  $ bill             : num [1:1266] 58.4 26.4 58.1 35.2 38.4 61.8 57.9 35.8 36.6 44.8 ...
##  $ pizzas           : num [1:1266] 4 2 3 3 2 4 3 2 2 5 ...
##  $ free_wine        : num [1:1266] 0 0 1 0 0 1 1 0 0 0 ...
##  $ got_wine         : num [1:1266] 0 0 0 0 0 1 1 0 0 0 ...
##  $ discount_customer: num [1:1266] 1 0 0 0 0 0 0 0 0 0 ...
# Number of rows and columns in the imported table
dim(pizza)
## [1] 1266   12

# Boxplot of the delivery time column
boxplot(pizza[["time"]])

# Fairly symmetric: the median line sits in the middle of the box and
# the two whiskers are approximately the same length

# Median of the delivery time column
median(pizza[["time"]])
## [1] 34.38196
#median delivery time = 34.38, i.e. about 35 minutes

#The boxplot for the delivery time is symmetric with a median delivery time of about 35 minutes.
#Most deliveries took between 30-40 minutes. The extreme values (outliers) indicate that there were some
#exceptionally short and long delivery times.

#---------------Problem 8 ------------------#

# Data for Problem 8: ten distance values and ten altitude values
distance <- c(12.5, 29.9, 14.8, 18.7, 7.6, 16.2, 16.5, 27.4, 12.1, 17.5)
altitude <- c(342, 1245, 502, 555, 398, 670, 796, 912, 238, 466)

# Combine the two vectors into one data frame and display it
a <- data.frame(distance, altitude)
a

# a. Calculate the arithmetic mean and median for both distance and altitude.

# Means, in the order (distance, altitude)
b <- c(mean(a$distance), mean(a$altitude))
b
## [1]  17.32 612.40

# Medians, in the order (distance, altitude).
# The result is stored as `medians` rather than `c`: a variable called `c`
# masks base::c() wherever the function is passed as a value, e.g.
# do.call(c, ...) or Reduce(c, ...).
medians <- c(median(a$distance), median(a$altitude))
medians
## [1]  16.35 528.50

# b. Determine the first and third quartiles for both the distance and the
# altitude variables. How much larger is the difference between the median
# and the first and third quartile?

# type = 5 selects one of quantile()'s alternative sample-quantile
# definitions instead of the default (type 7); keep it so the quartiles
# match the values quoted below.
quantile(a$distance, probs = seq(0, 1, 0.25), na.rm = FALSE, names = TRUE,
         type = 5)
##    0%   25%   50%   75%  100% 
##  7.60 12.50 16.35 18.70 29.90
quantile(a$altitude, probs = seq(0, 1, 0.25), na.rm = FALSE, names = TRUE,
         type = 5)
##     0%    25%    50%    75%   100% 
##  238.0  398.0  528.5  796.0 1245.0

# First quartile for the distance = 12.50
# Third quartile for the distance = 18.70
# First quartile for the altitude = 398
# Third quartile for the altitude = 796

# INTERPRETATION FOR THE DISTANCE
# The difference between the median and the first quartile
# (16.35 - 12.50 = 3.85) is larger than the difference between the third
# quartile and the median (18.70 - 16.35 = 2.35); this indicates a
# distribution that is skewed to the left.

# c. Create a boxplot. Looking at the distribution,
# would you say that it is symmetrical? Are there any extreme values?

# TRUE is spelled out because T is an ordinary variable that can be reassigned
boxplot(a$distance, horizontal = TRUE, xlab = "distance")

boxplot(a$altitude, horizontal = TRUE, xlab = "altitude")

#-----------------Dataset 1----------------------# 

#a) Create a stem and leaf plot for Science and Math scores. Compare the distribution.

# Attach readxl for read_excel(), import the workbook from the local
# Downloads folder and display the resulting table
library(readxl)
dataset1 <- read_excel(path = "C:/Users/user/Downloads/dataset1.xlsx")
dataset1

# Stem-and-leaf display of the Science scores
stem(x = dataset1[["Science"]])
## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   2 | 6888
##   3 | 00024444
##   3 | 6666688888888
##   4 | 0000022222244444
##   4 | 6666
##   5 | 00002

# Stem-and-leaf display of the Math scores
stem(x = dataset1[["Math"]])
## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   1 | 4
##   1 | 6688
##   2 | 000002222224444
##   2 | 666666688
##   3 | 0022444
##   3 | 6688
##   4 | 0222444
##   4 | 68
##   5 | 0
# Attach aplpack, which supplies stem.leaf.backback()
library(aplpack)
## Warning: package 'aplpack' was built under R version 4.1.1
# Back-to-back stem-and-leaf display: Science leaves are printed on the left
# of the stems and Math leaves on the right. With m = 1 each stem appears on
# a single line in the output below.
# NOTE(review): the column headers in the printed output appear to be taken
# from the argument expressions, so rewriting `dataset1$Science` /
# `dataset1$Math` in another form would likely change the printed header.
stem.leaf.backback(dataset1$Science,dataset1$Math,m=1)
## _____________________________________________________________________
##   1 | 2: represents 12, leaf unit: 1 
##                 dataset1$Science     dataset1$Math               
## _____________________________________________________________________
##                                 | 1 |46688                       5   
##     4                       8886| 2 |000002222224444666666688  (24)  
##   (21)     888888886666644442000| 3 |00224446688                21   
##   (20)      66664444422222200000| 4 |022244468                  10   
##     5                      20000| 5 |0                           1   
##                                 | 6 |                                
## _____________________________________________________________________
## n:                            50     50                          
## _____________________________________________________________________
#MATH
#Skewed to the right.  By turning the plot on its side, we can see that the distribution of the observations is skewed to the right. 
#More scores are located to the left side of the curve.  
#There are more observations with lower scores and very few observations with high scores. 

#SCIENCE
#Fairly Symmetric.

#b) What can be said of student scores in a positively skewed score distribution?

# Most students got lower scores (i.e. most students performed poorly), while only a few got high scores

#c) Referring to the data set above, count the number of students who belong to the high school strand 
#   stated in the table then compute for their %s (percentages) with respect to the sample size n (Total).