# Question 2: Use R to solve each of the following problems.
# a. The head of quality control stops by a factory to inspect the next 20 widgets produced.
#Over the past six months, 98% of widgets have passed inspection.
#What is the probability that less than 10 widgets pass inspection?
# P(X < 10) for X ~ Binomial(n = 20, p = 0.98).
# pbinom(q, ...) returns P(X <= q), so "less than 10" is P(X <= 9);
# using q = 10 would wrongly include the outcome of exactly 10 passes.
test_size <- 20
test_prob <- 0.98
p_fewer_than_10 <- pbinom(9, size = test_size, prob = test_prob)
print(p_fewer_than_10)
## approximately 2.91e-14 (re-knit to refresh; the old 1.574945e-12 was P(X <= 10))
# b. The average test score in a large college statistics course with multiple sections
#is 85 and the standard deviation is five points. If you choose a student at random,
#what is the probability that their score is between 80 and 85?
# P(80 < X < 85) for X ~ Normal(mean = 85, sd = 5).
# The probability of an interval is the difference of the CDF at its two
# endpoints. pnorm(abs(z)) alone is P(Z < 1) = 0.8413, i.e. everything below
# one standard deviation above the mean, not the area between 80 and 85.
# Descriptive names are used so base::mean() and stats::sd() are not shadowed.
score_mean <- 85
score_sd <- 5
Probabilty <- pnorm(85, mean = score_mean, sd = score_sd) -
  pnorm(80, mean = score_mean, sd = score_sd)
print(Probabilty)
## [1] 0.3413447
# c. Over the past three years at the farm you manage, the watermelons average about nine
#pounds each with a standard deviation of two pounds. What is the probability that a watermelon
#randomly selected from your harvest will weigh more than 10 pounds?
# P(X > 10) for X ~ Normal(mean = 9, sd = 2).
# lower.tail = FALSE asks pnorm() for the upper tail directly. The earlier
# 1 - pnorm(abs(z)) form only worked because z happened to be positive here;
# abs() would give the wrong tail for any threshold below the mean.
# Descriptive names are used so base::mean() and stats::sd() are not shadowed.
weight_mean <- 9
weight_sd <- 2
Probabilty <- pnorm(10, mean = weight_mean, sd = weight_sd, lower.tail = FALSE)
print(Probabilty)
## [1] 0.3085375
# d. You manage a help desk. 90% of your tickets are resolved with the first reply.
#If you randomly select 10 tickets for review, what is the probability that all 10 were
#resolved with the first reply?
# P(X = 10) for X ~ Binomial(n = 10, p = 0.90): every one of the 10 sampled
# tickets was resolved with the first reply.
test_size <- 10
test_prob <- 0.90
dbinom(x = test_size, size = test_size, prob = test_prob)
## [1] 0.3486784
# Question 3: From the MASS package, import the dataset "immer".
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("MASS")
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Print the raw dataset, then open its help page.
print(immer)
help("immer")
## starting httpd help server ...
## done
#The dataset records the yields of barley field trials in two years (1931 and 1932) at six locations.
#The dataset has the variables Loc (location), Var (barley variety), Y1 (yield in 1931), and Y2 (yield in 1932).
#It has 30 observations and 4 variables.
#The dataset is not tidy because the yields for 1931 and 1932 are stored in separate columns of the same row.
# Reshape immer to long form: the former Y1/Y2 column names go into `year`
# and their values into `Yield`, giving one row per yield measurement.
immerdata <- as_tibble(immer)
class(immerdata)
## [1] "tbl_df" "tbl" "data.frame"
immerdata <- immerdata %>%
  pivot_longer(cols = c(Y1, Y2), names_to = "year", values_to = "Yield")
#(immerdata)
# Store the year labels as a factor rather than character.
immerdata$year <- as.factor(immerdata$year)
# i) What values are present in the location variable? How many rows are there for each value?
unique(immerdata$Loc)
## [1] UF W M C GR D
## Levels: C D GR M UF W
# summary() of a factor tabulates its levels, so the row count per location
# is backed by output rather than stated without evidence.
summary(immerdata$Loc)
## C D GR M UF W
## 10 10 10 10 10 10
# 10 rows for each value of location
# Numeric vector of all 60 yield values: the 30 Y1 values followed by the 30
# Y2 values. c() builds it in one step instead of growing the vector one
# element at a time over the hard-coded indices 31:60.
yield <- c(immer$Y1, immer$Y2)
# ii) What values are present in the variety variable? How many rows are there for each value?
# Distinct varieties, in order of first appearance.
unique(immerdata[["Var"]])
## [1] M S V T P
## Levels: M P S T V
# Row count per variety (summary() of a factor tabulates its levels).
summary(immerdata[["Var"]])
## M P S T V
## 12 12 12 12 12
# iii) How many years are represented in the dataset? How many rows are there for each year?
head(immerdata)
unique(immerdata[["year"]])
## [1] Y1 Y2
## Levels: Y1 Y2
# Two years of data are present in the dataset: Y1 for 1931 and Y2 for 1932.
# Descriptive statistics for the first year's yield (column Y1 of the wide data).
summary(immer[["Y1"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69.10 87.42 102.95 109.05 124.60 191.50
var(immer[["Y1"]])
## [1] 822.2143
sd(immer[["Y1"]])
## [1] 28.67428
# Descriptive statistics for the second year's yield (column Y2 of the wide data).
summary(immer[["Y2"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 49.90 76.85 92.95 93.13 107.35 147.70
var(immer[["Y2"]])
## [1] 589.2975
sd(immer[["Y2"]])
## [1] 24.27545
# Each row of the wide data holds one value per year, so its row count is
# the number of rows for each year: 30.
nrow(immer)
## [1] 30
# iv) What is the mean yield?
# Mean over all 60 yield values (both years combined).
mean(immerdata[["Yield"]]) # 101.09
## [1] 101.09
# v) What is the first quartile, median, and third quartile for yield?
# Min, quartiles, median, mean and max in one table.
summary(immerdata[["Yield"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 49.90 80.62 97.50 101.09 119.72 191.50
# Same quartiles via quantile(), printed with more digits than summary().
quantile(immerdata[["Yield"]])
## 0% 25% 50% 75% 100%
## 49.900 80.625 97.500 119.725 191.500
#first quartile: 80.62
#median: 97.5
#third quartile: 119.72
# i) What are the minimum and maximum values for yield?
# min 49.90
# max 191.50
#ii What are the variance and standard deviation for yield?
# Computed on immerdata$Yield, the same column used for the mean, quartiles
# and boxplots, rather than on a separately hand-built copy of the values,
# so all yield statistics come from one source.
var(immerdata$Yield)
## [1] 758.1755
sd(immerdata$Yield)
## [1] 27.53499
#iii) Create two boxplots for yield.
#The first boxplot should have bars that extend from the minimum to the maximum.
# range = 0 makes the whiskers extend to the data extremes.
boxplot(immerdata[["Yield"]], range = 0, horizontal = TRUE)

#The second boxplot should have bars that extend from (Q1 - 1.5 x IQR) to (Q3 + 1.5 x IQR)
# range = 1.5 limits the whiskers to 1.5 x IQR beyond the box.
boxplot(immerdata[["Yield"]], range = 1.5, horizontal = TRUE)

# Question 4: From the MASS package, import the dataset "Pima.tr".
# Open the help page, keep a tibble copy for analysis, and take a first look
# at the rows and the column names.
help("Pima.tr")
pimadataset <- as_tibble(Pima.tr)
print(pimadataset)
data("Pima.tr")
unique(Pima.tr)
names(Pima.tr)
## [1] "npreg" "glu" "bp" "skin" "bmi" "ped" "age" "type"
# Data-quality checks for duplicate rows and missing values.
# anyDuplicated() returns 0 when no row repeats an earlier one; this replaces
# printing all 200 values of duplicated(Pima.tr), every one of which was FALSE.
anyDuplicated(Pima.tr)
## [1] 0
# anyNA() is TRUE if any cell is missing; the script previously had no check
# for missing values at all.
anyNA(Pima.tr)
## [1] FALSE
# Column-wise overview: dimensions, column types and the first few values.
dplyr::glimpse(Pima.tr)
## Rows: 200
## Columns: 8
## $ npreg <int> 5, 7, 5, 0, 0, 5, 3, 1, 3, 2, 0, 9, 1, 12, 1, 4, 1, 11, 1, 0, 2,~
## $ glu <int> 86, 195, 77, 165, 107, 97, 83, 193, 142, 128, 137, 154, 189, 92,~
## $ bp <int> 68, 70, 82, 76, 60, 76, 58, 50, 80, 78, 40, 78, 60, 62, 66, 76, ~
## $ skin <int> 28, 33, 41, 43, 25, 27, 31, 16, 15, 37, 35, 30, 23, 7, 52, 15, 8~
## $ bmi <dbl> 30.2, 25.1, 35.8, 47.9, 26.4, 35.6, 34.3, 25.9, 32.4, 43.3, 43.1~
## $ ped <dbl> 0.364, 0.163, 0.156, 0.259, 0.133, 0.378, 0.336, 0.655, 0.200, 1~
## $ age <int> 24, 55, 35, 26, 23, 52, 25, 24, 63, 31, 33, 45, 59, 44, 29, 21, ~
## $ type <fct> No, Yes, No, No, No, Yes, No, No, No, Yes, Yes, No, Yes, Yes, No~
# Spot check: rows for 22-year-old women with exactly one pregnancy.
pimadataset %>%
  dplyr::filter(npreg == 1, age == 22)
#It is a dataset of women at least 21 years old and of Pima Indian heritage,
#who were tested for diabetes by the US National Institute of Diabetes and Digestive and Kidney Diseases.
#The dataset consists of 200 rows and 8 columns. The data is clean and tidy: after checking, it has no duplicates or missing values,
#each cell contains one value, each row describes one record, and each column header is a variable name rather than a data value.
#The dataset describes the number of pregnancies, plasma glucose concentration, diastolic blood pressure, triceps skin fold thickness, body mass index,
#diabetes pedigree function, age, and diabetic status (Yes or No, according to WHO criteria) for each woman.
#Write a loop that calculates and prints the mean of each numeric variable
# Pick the columns by type instead of the hard-coded positions 2:5, which
# skipped the numeric columns npreg, ped and age. The factor column `type`
# is excluded automatically because is.numeric() is FALSE for factors.
numeric_cols <- names(pimadataset)[vapply(pimadataset, is.numeric, logical(1))]
for (col in numeric_cols) {
  print(mean(pimadataset[[col]]))
}
# Previously printed means (unchanged by this fix): glu 123.97, bp 71.26,
# skin 29.215, bmi 32.31. Re-knit to regenerate the output, which now also
# includes the means of npreg, ped and age.