Question 1: Create three vectors of numbers. For each, print the vector, the mean, the median, the variance, and the standard deviation.

# Vector 1 should have the mean and median equal.
# Five evenly spaced values: the distribution is symmetric, so the mean and
# the median coincide.
vector1 <- c(40, 50, 60, 70, 80)

# The question asks for the vector itself to be printed as well.
print(vector1)

## [1] 40 50 60 70 80

mean(vector1)

## [1] 60

median(vector1)

## [1] 60

# var() and sd() use the sample (n - 1) denominator: 1000 / 4 = 250.
var(vector1)

## [1] 250

sd(vector1)

## [1] 15.81139

#Vector 2 should have the mean greater than the median.

# Two large values (130, 150) pull the mean above the median (right skew).
vector2 <- c(40, 50, 60, 130, 150)

# The question asks for the vector itself to be printed as well.
print(vector2)

## [1]  40  50  60 130 150

mean(vector2)

## [1] 86

median(vector2)

## [1] 60

# var() and sd() use the sample (n - 1) denominator.
var(vector2)

## [1] 2530

sd(vector2)

## [1] 50.29911

#Vector 3 should have the mean less than the median.

# Two small values (10, 20) pull the mean below the median (left skew).
vector3 <- c(10, 20, 60, 65, 70)

# The question asks for the vector itself to be printed as well.
print(vector3)

## [1] 10 20 60 65 70

mean(vector3)

## [1] 45

median(vector3)

## [1] 60

# var() and sd() use the sample (n - 1) denominator.
var(vector3)

## [1] 775

sd(vector3)

## [1] 27.83882

Question 2: Use R to solve each of the following problems.

# a. The head of quality control stops by a factory to inspect the next 20 widgets produced. 
#Over the past six months, 98% of widgets have passed inspection. 
#What is the probability that less than 10 widgets pass inspection?
# The number of passing widgets among the 20 inspected is modelled as
# Binomial(size = 20, prob = 0.98).
test_size <- 20
test_prob <- 0.98

# "Less than 10" is the strict inequality P(X < 10) = P(X <= 9).
# pbinom(q, ...) returns P(X <= q), so the cutoff must be 9; a cutoff of 10
# would also count the case of exactly 10 passing widgets.
prob_fewer_than_10 <- pbinom(9, size = test_size, prob = test_prob)
print(prob_fewer_than_10)

# Expected result: roughly 2.9e-14, i.e. essentially zero.

# b. The average test score in a large college statistics course with multiple sections 
#is 85 and the standard deviation is five points. If you choose a student at random, 
#what is the probability that their score is between 80 and 85?
# Scores are modelled as Normal(mean = 85, sd = 5).
# Descriptive names are used so the base functions mean() and sd() are not
# shadowed by numeric variables.
score_mean <- 85
score_sd <- 5

# P(80 <= X <= 85) is the difference of the CDF at the two endpoints.
# pnorm(|z|) on its own gives P(Z <= 1), the whole area below a score of 90,
# which is not the requested interval.
prob_80_to_85 <- pnorm(85, mean = score_mean, sd = score_sd) -
  pnorm(80, mean = score_mean, sd = score_sd)
print(prob_80_to_85)

## [1] 0.3413447

# c. Over the past three years at the farm you manage, the watermelons average about nine 
#pounds each with a standard deviation of two pounds. What is the probability that a watermelon 
#randomly selected from your harvest will weigh more than 10 pounds?
# Watermelon weights are modelled as Normal(mean = 9, sd = 2).
# Descriptive names are used so the base functions mean() and sd() are not
# shadowed by numeric variables.
watermelon_mean <- 9
watermelon_sd <- 2

# P(X > 10) is the upper tail, requested directly with lower.tail = FALSE.
# This avoids 1 - pnorm(abs(z)), where abs() would silently return the wrong
# tail for any threshold below the mean.
prob_over_10 <- pnorm(
  10,
  mean = watermelon_mean,
  sd = watermelon_sd,
  lower.tail = FALSE
)
print(prob_over_10)

## [1] 0.3085375

# d. You manage a help desk. 90% of your tickets are resolved with the first reply. 
#If you randomly select 10 tickets for review, what is the probability that all 10 were 
#resolved with the first reply?

# Ten reviewed tickets, each resolved with the first reply with
# probability 0.90.
test_prob <- 0.90
test_size <- 10

# All tickets resolved on the first reply: the binomial mass at x = size.
dbinom(x = test_size, size = test_size, prob = test_prob)

## [1] 0.3486784

Question 3 From the MASS package, import the dataset “immer.”

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library("MASS")

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

# Print the raw data and open its help page (interactive inspection).
immer

help(immer)

## starting httpd help server ...

##  done

# immer records barley yields at six locations in two years, 1931 and 1932.
# Variables: Loc (location), Var (barley variety), Y1 (yield in 1931) and
# Y2 (yield in 1932). It has 30 observations of 4 variables.
# The data is not tidy: the yields of both years sit side by side in one row.
immerdata <- as_tibble(immer)
class(immerdata)

## [1] "tbl_df"     "tbl"        "data.frame"

# Stack Y1 and Y2 so that each row holds a single yield, with the source
# column name recorded in `year`.
immerdata <- immerdata %>%
  pivot_longer(
    cols = c(Y1, Y2),
    names_to = "year",
    values_to = "Yield"
  )

# immerdata

# Store the year labels (Y1, Y2) as a factor.
immerdata$year <- factor(immerdata$year)

# i) What values are present in the location variable? How many rows are there for each value?

# Distinct locations, in order of first appearance.
unique(immerdata$Loc)

## [1] UF W  M  C  GR D 
## Levels: C D GR M UF W

# Count the rows per location instead of only asserting the number:
# summary() on a factor tabulates its levels.
summary(immerdata$Loc)

##  C  D GR  M UF  W 
## 10 10 10 10 10 10

# All yield values as one plain numeric vector (not a data frame): the 30
# values of Y1 followed by the 30 values of Y2. c() concatenates them in a
# single vectorised step, replacing the element-by-element loop over 31:60.
yield <- c(immer$Y1, immer$Y2)

# ii) What values are present in the variety variable? How many rows are there for each value?

# Distinct barley varieties, in order of first appearance.
with(immerdata, unique(Var))

## [1] M S V T P
## Levels: M P S T V

# Rows per variety: summary() of a factor counts each level.
with(immerdata, summary(Var))

##  M  P  S  T  V 
## 12 12 12 12 12

# iii) How many years are represented in the dataset? How many rows are there for each year?

head(immerdata)

unique(immerdata$year)

## [1] Y1 Y2
## Levels: Y1 Y2

# Two years of data are present in the dataset: Y1 is 1931 and Y2 is 1932.

# Rows per year, counted directly on the tidy year column rather than
# inferred from the size of the wide table.
summary(immerdata$year)

## Y1 Y2 
## 30 30

# Descriptive statistics for the 1931 yields (Y1).
summary(immer$Y1)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   69.10   87.42  102.95  109.05  124.60  191.50

var(immer$Y1)

## [1] 822.2143

sd(immer$Y1)

## [1] 28.67428

# Descriptive statistics for the 1932 yields (Y2).
summary(immer$Y2)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   49.90   76.85   92.95   93.13  107.35  147.70

var(immer$Y2)

## [1] 589.2975

sd(immer$Y2)

## [1] 24.27545

# Cross-check: every row of the wide table contributes one value to each
# year, so its row count equals the number of rows per year (30).
nrow(immer)

## [1] 30

# iv) What is the mean yield?

# Overall mean yield across both years (about 101.09).
with(immerdata, mean(Yield))

## [1] 101.09

# v) What is the first quartile, median, and third quartile for yield?

# Five-number summary plus the mean (values are rounded for display).
summary(immerdata$Yield)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   49.90   80.62   97.50  101.09  119.72  191.50

# Request exactly the three quartiles the question asks for.
quantile(immerdata$Yield, probs = c(0.25, 0.50, 0.75))

##     25%     50%     75% 
##  80.625  97.500 119.725

# First quartile: 80.62 (80.625 unrounded; it had been mis-recorded as 82.62)
# Median: 97.5
# Third quartile: 119.72 (119.725 unrounded)

# i) What are the minimum and maximum values for yield?

# Compute the extremes instead of copying them from the summary output.
range(immerdata$Yield)

## [1]  49.9 191.5

# min 49.90
# max 191.50

#ii What are the variance and standard deviation for yield?
# Computed on the tidy Yield column, like the other yield answers, rather
# than on a separately assembled vector. The values are the same because
# both hold the same 60 yields.
var(immerdata$Yield)

## [1] 758.1755

sd(immerdata$Yield)

## [1] 27.53499

#iii) Create two boxplots for yield.
#The first boxplot should have bars that extend from the minimum to the maximum.

# range = 0 makes the whiskers reach the data extremes.
boxplot(immerdata$Yield, horizontal = TRUE, range = 0)

#The second boxplot should have bars that extend from (Q1 - 1.5 x IQR) to (Q3 + 1.5 x IQR)

# range = 1.5 limits the whiskers to 1.5 x IQR beyond the box.
boxplot(immerdata$Yield, horizontal = TRUE, range = 1.5)

Question 4 From the MASS package, import the dataset “Pima.tr.”

# Open the help page for the dataset (interactive inspection).
help(Pima.tr)

# Keep a tibble copy of the data and print it.
pimadataset <- as_tibble(Pima.tr)
pimadataset

# Load Pima.tr into the workspace and print its distinct rows.
data(Pima.tr)
unique(Pima.tr)

# Column names of the dataset.
names(Pima.tr)

## [1] "npreg" "glu"   "bp"    "skin"  "bmi"   "ped"   "age"   "type"

# Duplicate check. duplicated() returns one logical per row (200 values, all
# FALSE in the recorded run); anyDuplicated() condenses that into a single
# number, which is 0 when no row repeats an earlier one.
anyDuplicated(Pima.tr)

## [1] 0

# Missing-value check backing the "no missing values" statement in the
# dataset description: returns TRUE if any cell is NA.
anyNA(Pima.tr)

# Column types and a preview of the first values of each column.
glimpse(Pima.tr)

## Rows: 200
## Columns: 8
## $ npreg <int> 5, 7, 5, 0, 0, 5, 3, 1, 3, 2, 0, 9, 1, 12, 1, 4, 1, 11, 1, 0, 2,~
## $ glu   <int> 86, 195, 77, 165, 107, 97, 83, 193, 142, 128, 137, 154, 189, 92,~
## $ bp    <int> 68, 70, 82, 76, 60, 76, 58, 50, 80, 78, 40, 78, 60, 62, 66, 76, ~
## $ skin  <int> 28, 33, 41, 43, 25, 27, 31, 16, 15, 37, 35, 30, 23, 7, 52, 15, 8~
## $ bmi   <dbl> 30.2, 25.1, 35.8, 47.9, 26.4, 35.6, 34.3, 25.9, 32.4, 43.3, 43.1~
## $ ped   <dbl> 0.364, 0.163, 0.156, 0.259, 0.133, 0.378, 0.336, 0.655, 0.200, 1~
## $ age   <int> 24, 55, 35, 26, 23, 52, 25, 24, 63, 31, 33, 45, 59, 44, 29, 21, ~
## $ type  <fct> No, Yes, No, No, No, Yes, No, No, No, Yes, Yes, No, Yes, Yes, No~

# Spot check: rows that share npreg == 1 and age == 22.
pimadataset[pimadataset$npreg == 1 & pimadataset$age == 22, ]

#It is a dataset of women at least 21 years old of Pima Indian heritage,
#who were tested for diabetes by the US National Institute of Diabetes and Digestive and Kidney Diseases.
#The dataset contains 200 rows and 8 columns. The data is clean and tidy: after checking, it has no duplicates or missing values,
#each cell contains one value, each row describes one record, and each column header is a variable name rather than a recorded value.
#The dataset describes the number of pregnancies, plasma glucose concentration, diastolic blood pressure, triceps skin fold thickness, body mass index,
#diabetes pedigree function, age, and a yes/no diabetes classification according to WHO criteria for each woman.

#Write a loop that calculates and prints the mean of each numeric variable

# Print the mean of every numeric column. Columns are selected with
# is.numeric() rather than by the fixed positions 2:5, which covered only
# glu, bp, skin and bmi and left out the numeric columns npreg, ped and age.
# The factor column `type` is skipped by the same test.
for (column_name in names(pimadataset)) {
  column_values <- pimadataset[[column_name]]
  if (is.numeric(column_values)) {
    cat(column_name, ": ", mean(column_values), "\n", sep = "")
  }
}

# Means recorded by the earlier run for columns 2:5:
# glu 123.97, bp 71.26, skin 29.215, bmi 32.31

4.18 Problem Set Statistics

Haddy Loum

Question 1: Create three vectors of numbers. For each, print the vector, the mean, the median, the variance, and the standard deviation.

Question 2: Use R to solve each of the following problems.

Question 3 From the MASS package, import the dataset “immer.”

Question 4 From the MASS package, import the dataset “Pima.tr.”