##Oral and Hands-on Exam on Page 111 and 112
#Problem 6
# Problem 6 data: 15 noise-level readings for each of six work areas
area1 <- c(30, 12, 35, 65, 24, 59, 68, 57, 100, 61, 32, 45, 92, 56, 44)
area2 <- c(64, 99, 87, 59, 23, 16, 94, 78, 57, 32, 52, 78, 59, 55, 55)
area3 <- c(100, 59, 78, 97, 84, 64, 53, 59, 89, 88, 94, 66, 57, 62, 64)
area4 <- c(25, 15, 30, 20, 61, 56, 34, 22, 24, 21, 32, 52, 14, 10, 33)
area5 <- c(59, 63, 81, 110, 65, 112, 132, 145, 163, 120, 84, 99, 105, 68, 75)
area6 <- c(67, 80, 99, 49, 67, 56, 80, 125, 100, 93, 56, 45, 80, 34, 21)

# Draw one horizontal boxplot per area, in the order area1 .. area6.
# invisible() keeps the list of boxplot statistics from being printed.
invisible(lapply(
  list(area1, area2, area3, area4, area5, area6),
  boxplot,
  horizontal = TRUE
))

# Confirm every area has the same number of readings before combining them
length(area1)
## [1] 15
length(area2)
## [1] 15
length(area3)
## [1] 15
length(area4)
## [1] 15
length(area5)
## [1] 15
length(area6)
## [1] 15

# Combine the six vectors into one data frame (one column per area) and show it
areadf <- data.frame(area1, area2, area3, area4, area5, area6)
areadf

# All six areas in a single plot so they can be compared directly
boxplot(
  areadf,
  col = "blue",
  horizontal = TRUE,
  xlab = "All Area Noise Level"
)

# Five-number summary plus mean for every area
summary(areadf)
## area1 area2 area3 area4
## Min. : 12.0 Min. :16.00 Min. : 53.00 Min. :10.00
## 1st Qu.: 33.5 1st Qu.:53.50 1st Qu.: 60.50 1st Qu.:20.50
## Median : 56.0 Median :59.00 Median : 66.00 Median :25.00
## Mean : 52.0 Mean :60.53 Mean : 74.27 Mean :29.93
## 3rd Qu.: 63.0 3rd Qu.:78.00 3rd Qu.: 88.50 3rd Qu.:33.50
## Max. :100.0 Max. :99.00 Max. :100.00 Max. :61.00
## area5 area6
## Min. : 59.00 Min. : 21.00
## 1st Qu.: 71.50 1st Qu.: 52.50
## Median : 99.00 Median : 67.00
## Mean : 98.73 Mean : 70.13
## 3rd Qu.:116.00 3rd Qu.: 86.50
## Max. :163.00 Max. :125.00
#From this boxplot, we see that about 20% of the readings in area 5 (3 of 15) are above the safe hearing
#level of 120 decibels. Those workers in area 5 should definitely have protective earwear.
# One of the readings in area 6 is above the safe hearing level.
# It might be a good idea to provide protective earwear to the workers in area 6 as well.
# Areas 1-4 appear to be "safe" with respect to hearing level, with area 4 being the safest.
#---------------Problem 7------------------#
# readxl supplies read_excel(), used to import the workbook below
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.2
# Read the pizza delivery workbook and display it.
# NOTE(review): the path is an absolute location on one machine; the script
# will only run where this exact file exists.
pizza <- read_excel(path = "C:/Users/user/Downloads/pizza_delivery.xls")
pizza
# Column types and a preview of each column
str(pizza)
## tibble [1,266 x 12] (S3: tbl_df/tbl/data.frame)
## $ day : chr [1:1266] "Thursday" "Thursday" "Thursday" "Thursday" ...
## $ date : POSIXct[1:1266], format: "2014-05-01" "2014-05-01" ...
## $ time : num [1:1266] 35.1 25.2 45.6 29.4 30 ...
## $ operator : chr [1:1266] "Laura" "Melissa" "Melissa" "Melissa" ...
## $ branch : chr [1:1266] "East" "East" "West" "East" ...
## $ driver : chr [1:1266] "Bruno" "Salvatore" "Salvatore" "Salvatore" ...
## $ temperature : num [1:1266] 68.3 71 53.4 70.3 71.5 ...
## $ bill : num [1:1266] 58.4 26.4 58.1 35.2 38.4 61.8 57.9 35.8 36.6 44.8 ...
## $ pizzas : num [1:1266] 4 2 3 3 2 4 3 2 2 5 ...
## $ free_wine : num [1:1266] 0 0 1 0 0 1 1 0 0 0 ...
## $ got_wine : num [1:1266] 0 0 0 0 0 1 1 0 0 0 ...
## $ discount_customer: num [1:1266] 1 0 0 0 0 0 0 0 0 0 ...
# Number of rows and columns
dim(pizza)
## [1] 1266 12
# Boxplot of the delivery time column
boxplot(pizza[["time"]])

# Fairly symmetric: the median sits in the middle of the box and the two
# whiskers are approximately the same length.
median(pizza[["time"]])
## [1] 34.38196
#median delivery time = 34.38 minutes (roughly 35)
#The boxplot for the delivery time is symmetric with a median delivery time of about 35 minutes.
#Most deliveries took between 30-40 minutes. The extreme values (outliers) indicate that there were some
#exceptionally short and long delivery times.
#---------------Problem 8 ------------------#
# Problem 8 data: ten paired observations of distance and altitude
distance <- c(12.5, 29.9, 14.8, 18.7, 7.6, 16.2, 16.5, 27.4, 12.1, 17.5)
altitude <- c(342, 1245, 502, 555, 398, 670, 796, 912, 238, 466)
# Put both variables into one data frame (one column each) and display it
a <- data.frame(distance, altitude)
a
#a. Calculate the arithmetic mean and median for both distance and altitude.
# Each result is an unnamed vector ordered as (distance, altitude).
# NOTE(review): the result names `b` and `c` are kept as-is. `c` shadows
# base::c() only as a variable; calls such as c(1, 2) still find the function.
b <- unname(vapply(a, mean, numeric(1)))
b
## [1] 17.32 612.40
c <- unname(vapply(a, median, numeric(1)))
c
## [1] 16.35 528.50
#b. Determine the first and third quartiles for both the distance and the altitude variables.
#How much larger is the difference between the median and the first and third quartile?
# quantile() already defaults to probs = seq(0, 1, 0.25), na.rm = FALSE and
# names = TRUE, so only the non-default quantile algorithm (type 5) is given.
quantile(a$distance, type = 5)
## 0% 25% 50% 75% 100%
## 7.60 12.50 16.35 18.70 29.90
quantile(a$altitude, type = 5)
## 0% 25% 50% 75% 100%
## 238.0 398.0 528.5 796.0 1245.0
#first quartile for the distance = 12.50
#third quartile for the distance = 18.70
#first quartile for the altitude = 398
#third quartile for the altitude = 796
#INTERPRETATION FOR THE DISTANCE
#The difference between the median and the first quartile (16.35 - 12.50 = 3.85) is larger than the difference
#between the third quartile and the median (18.70 - 16.35 = 2.35); this indicates a distribution that is skewed to
#the left
#c. Create a boxplot. Looking at the distribution,
#would you say that it is symmetrical? Is there any extreme values?
# One horizontal boxplot per variable of data frame `a`; the x-axis label
# names the variable being plotted.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the plot orientation.
boxplot(a$distance, horizontal = TRUE, xlab = "distance")

boxplot(a$altitude, horizontal = TRUE, xlab = "altitude")

#-----------------Dataset 1----------------------#
#a) Create a stem and leaf plot for Science and Math scores. Compare the distribution.
# readxl supplies read_excel() for importing the workbook
library(readxl)
# Read the data set and display it.
# NOTE(review): the path is an absolute location on one machine; the script
# will only run where this exact file exists.
dataset1 <- read_excel(path = "C:/Users/user/Downloads/dataset1.xlsx")
dataset1
# Separate stem-and-leaf displays, Science first and then Math
stem(dataset1[["Science"]])
##
## The decimal point is 1 digit(s) to the right of the |
##
## 2 | 6888
## 3 | 00024444
## 3 | 6666688888888
## 4 | 0000022222244444
## 4 | 6666
## 5 | 00002
stem(dataset1[["Math"]])
##
## The decimal point is 1 digit(s) to the right of the |
##
## 1 | 4
## 1 | 6688
## 2 | 000002222224444
## 2 | 666666688
## 3 | 0022444
## 3 | 6688
## 4 | 0222444
## 4 | 68
## 5 | 0
# aplpack supplies stem.leaf.backback() for the back-to-back display
library(aplpack)
## Warning: package 'aplpack' was built under R version 4.1.1
# Back-to-back display with Science on the left and Math on the right.
# The `dataset1$...` expressions are written out literally because they appear
# as the column headers of the printed display.
stem.leaf.backback(dataset1$Science, dataset1$Math, m = 1)
## _____________________________________________________________________
## 1 | 2: represents 12, leaf unit: 1
## dataset1$Science dataset1$Math
## _____________________________________________________________________
## | 1 |46688 5
## 4 8886| 2 |000002222224444666666688 (24)
## (21) 888888886666644442000| 3 |00224446688 21
## (20) 66664444422222200000| 4 |022244468 10
## 5 20000| 5 |0 1
## | 6 |
## _____________________________________________________________________
## n: 50 50
## _____________________________________________________________________
#MATH
#Skewed to the right. By turning the plot on its side, we can see that the distribution of the observations is skewed to the right.
#Most of the scores are located on the left side of the curve.
#There are more observations with lower scores and very few observations with high scores.
#SCIENCE
#Fairly Symmetric.
#b) What can be said of student scores in a positively skewed score distribution?
# Most students got lower scores, i.e. more students performed poorly, and only a few got high scores.
#c) Referring to the data set above, count the number of students who belong to the high school strand
# stated in the table then compute for their %s (percentages) with respect to the sample size n (Total).