CIAM 6256: Problem Set for Assignment 1.

#P1. Measurement of pH values of 44 lakes in a region results in the following data set 4.59 ,4.58, 4.96, 5.47, 6.67, 5.60, 5.97, 6.64, 4.99, 4.63, 4.60, 4.97, 4.80, 5.31, 4.87, 6.06,4.93, 4.68, 6.15, 5.31, 4.47, 4.85, 4.32, 4.72, 5.42, 5.87, 5.38, 5.60, 5.07, 4.82, 6.26, 4.60, 5.06, 4.97, 4.53, 5.72, 6.27, 5.41, 6.72 , 6.23, 5.42, 5.99, 4.88, 5.97

# pH readings of the 44 lakes, stored as one numeric vector.
pH <- c(
  4.59, 4.58, 4.96, 5.47, 6.67, 5.60, 5.97, 6.64, 4.99, 4.63, 4.60,
  4.97, 4.80, 5.31, 4.87, 6.06, 4.93, 4.68, 6.15, 5.31, 4.47, 4.85,
  4.32, 4.72, 5.42, 5.87, 5.38, 5.60, 5.07, 4.82, 6.26, 4.60, 5.06,
  4.97, 4.53, 5.72, 6.27, 5.41, 6.72, 6.23, 5.42, 5.99, 4.88, 5.97
)

##(a) Find the arithmetic mean, median, standard deviation.

meanph <- mean(pH)   # arithmetic mean
medph <- median(pH)  # median
sdph <- sd(pH)       # sample standard deviation (sd() uses the n - 1 denominator)

# Report each statistic; cat() separates its arguments with a single space.
cat("The arithmetic mean of the data set is ", meanph)
## The arithmetic mean of the data set is  5.325682
cat("The median of the data set is ", medph)
## The median of the data set is  5.19
cat("The standard deviation of the data set is ", sdph)
## The standard deviation of the data set is  0.6639291

##(b) Histogram: Plot the frequency distribution with appropriate breaks.

# Frequency histogram of the lake pH readings, using classes of width 0.5
# from 4 to 7 and a y-axis fixed at 0-20.
hist(
  pH,
  breaks = seq(4, 7, by = 0.5),
  col = "rosybrown2",
  xlab = "pH values",
  main = "Histogram of pH values of 44 lakes",
  ylim = c(0, 20)
)

###The histogram was created from the pH values; the most frequent class is pH 4.5-5.0. The distribution is asymmetric and skewed to the right.

##(c) Plot the pie graph of the relative frequency.

# Bin the readings into the same 0.5-wide classes as the histogram, keeping
# only the returned "histogram" object (plot = FALSE suppresses the drawing).
f <- hist(pH, breaks = seq(4, 7, by = 0.5), plot = FALSE)
f
## $breaks
## [1] 4.0 4.5 5.0 5.5 6.0 6.5 7.0
## 
## $counts
## [1]  2 18  9  7  5  3
## 
## $density
## [1] 0.09090909 0.81818182 0.40909091 0.31818182 0.22727273 0.13636364
## 
## $mids
## [1] 4.25 4.75 5.25 5.75 6.25 6.75
## 
## $xname
## [1] "pH"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
# Class boundaries, printed for reference.
breaks <- seq(4, 7, by = 0.5)
breaks
## [1] 4.0 4.5 5.0 5.5 6.0 6.5 7.0
# Number of readings in each class.
freq <- f$counts
freq
## [1]  2 18  9  7  5  3
# Share of all readings that falls in each class.
relf <- freq / length(pH)
relf
## [1] 0.04545455 0.40909091 0.20454545 0.15909091 0.11363636 0.06818182
# Pie chart: each slice is labelled "lower - upper" from adjacent boundaries.
lbb <- paste(head(f$breaks, -1), "-", tail(f$breaks, -1))
pie(
  relf,
  labels = lbb,
  col = heat.colors(length(relf)),
  main = "Relative Frequency of pH values in 44 lakes"
)

###The pie chart of relative frequency shows that the largest share of the lakes (about 41%) had a pH of 4.5-5.0, followed by pH 5.0-5.5 (about 20%) and pH 5.5-6.0 (about 16%). The most frequent classes are therefore acidic.

#P2. The following data represent the acidity of 40 successive rainfalls in the state of Minnesota. The acidity is measured on a pH scale, which varies from 1 (very acidic) to 7 (neutral): 3.71, 4.23, 4.16, 2.98, 3.23, 4.67, 3.99, 5.04, 4.55, 3.24, 2.8, 3.44, 3.27, 2.66, 2.95, 4.7, 5.12, 3.77, 3.12, 2.38, 4.57, 3.88, 2.97, 3.7, 2.53, 2.67, 4.12, 4.8, 3.55, 3.86, 2.51, 2.33, 3.85, 2.35, 3.12, 4.39, 5.09, 3.38, 2.73, 3.07

# pH of the 40 successive rainfalls, stored as one numeric vector.
Rainfall <- c(
  3.71, 4.23, 4.16, 2.98, 3.23, 4.67, 3.99, 5.04, 4.55, 3.24,
  2.8, 3.44, 3.27, 2.66, 2.95, 4.7, 5.12, 3.77, 3.12, 2.38,
  4.57, 3.88, 2.97, 3.7, 2.53, 2.67, 4.12, 4.8, 3.55, 3.86,
  2.51, 2.33, 3.85, 2.35, 3.12, 4.39, 5.09, 3.38, 2.73, 3.07
)

##(a) Find the sample mean, median, variance, and standard deviation.

meanrain <- mean(Rainfall)      # sample mean
medianrain <- median(Rainfall)  # sample median
varrain <- var(Rainfall)        # sample variance (n - 1 denominator)
sdrain <- sd(Rainfall)          # sample standard deviation

# Report each statistic; cat() separates its arguments with a single space.
cat("The arithmetic mean of the data set is ", meanrain)
## The arithmetic mean of the data set is  3.587
cat("The median of the data set is ", medianrain)
## The median of the data set is  3.495
cat("The variance of the data set is ", varrain)
## The variance of the data set is  0.6869703
cat("The standard deviation of the data set is ", sdrain)
## The standard deviation of the data set is  0.8288367

##(b) Find and plot the frequency table.

# Frequency table: cut() divides the observed range into 8 classes of equal
# length and table() counts the rainfalls that fall in each class.
freq_tab <- table(cut(Rainfall, breaks = 8))
freq_tab
## 
## (2.33,2.68] (2.68,3.03] (3.03,3.38] (3.38,3.73] (3.73,4.07] (4.07,4.42] 
##           7           5           6           5           5           4 
## (4.42,4.77] (4.77,5.12] 
##           4           4
# Frequency plot (histogram).
# NOTE(review): hist() is called without `breaks`, so it chooses its own
# default classes; they are not the 8 classes tabulated above.
hist(
  Rainfall,
  xlab = "Acidity of rainfall in Minnesota (pH)",
  ylab = "Frequency",
  main = "Histogram of pH values of 40 successive rainfalls in Minnesota",
  col = "mediumpurple",
  ylim = c(0, 10),
  xlim = c(0, 6)
)

# Printed interpretation of the histogram.
cat("The highest frequency occurs between pH 2.5 and 4.0 which means that most rainfall events are acidic. The histogram appears slightly skewed to the right, this indicates that more rainfall events are more acid (lower pH) compared to basic (higher pH).")
## The highest frequency occurs between pH 2.5 and 4.0 which means that most rainfall events are acidic. The histogram appears slightly skewed to the right, this indicates that more rainfall events are more acid (lower pH) compared to basic (higher pH).

#P3. The following table gives yearly per capita soft drink consumption (in litres) and the yearly per capita milk consumption (in kg) for a variety of countries. Use it to find the sample correlation coefficient between soft drink and milk consumption.

Country, soft drink, milk: United States 216, 254; Australia 100, 233; Switzerland 81, 308; France 37, 256; United Kingdom 97, 230; The Netherlands 96, 329; New Zealand 84, 210; Germany 72, 314; Italy 50, 239; Japan 22, 68

##(a) Plot the scattered chart of per-capita soft drink against milk.

library(readr)
# Load the consumption table; the path is relative to the working directory.
SoftDrink <- read_csv("SoftDrink.csv")
#View(SoftDrink)
print(SoftDrink)
## # A tibble: 10 × 3
##    Country         `soft drink`  milk
##    <chr>                  <dbl> <dbl>
##  1 United States            216   254
##  2 Australia                100   233
##  3 Switzerland               81   308
##  4 France                    37   256
##  5 United Kingdom            97   230
##  6 The Netherlands           96   329
##  7 New Zealand               84   210
##  8 Germany                   72   314
##  9 Italy                     50   239
## 10 Japan                     22    68
# Pull the two consumption columns out as plain vectors.
softdrink <- SoftDrink$`soft drink`
milk <- SoftDrink$milk
# Scatter plot of milk against soft drink consumption. Base graphics are built
# by successive calls: plot() draws the points, then text() adds the country
# labels to the same plot. Joining the two calls with `+` only added their
# NULL return values together, which printed a stray `integer(0)`.
plot(
  softdrink, milk,
  main = "Soft Drink vs Milk Consumption Per Capita for a Variety of Countries",
  xlab = "Soft Drink (litres)",
  ylab = "Milk (kg)",
  pch = 19,
  col = "skyblue2",
  ylim = c(20, 370),
  xlim = c(10, 270)
)
text(softdrink, milk, labels = SoftDrink$Country, pos = 4, cex = 0.7)

##(b) Calculate the correlation between per-capita soft drinking and milk consumptions.

# Sample correlation coefficient between the soft drink and milk series.
corsd <- cor(x = softdrink, y = milk)
# Report the coefficient, then its interpretation.
cat(
  "The correlation between per-capita soft drinking and milk consumptions ",
  corsd
)
## The correlation between per-capita soft drinking and milk consumptions  0.3256602
cat(
  "This correlation is positive which means that as one variable (soft drink) increases so does the other variable (milk consumption)"
)
## This correlation is positive which means that as one variable (soft drink) increases so does the other variable (milk consumption)

#P4. In a study investigating the effects of a river dam on fish migration, 40 fish were marked and released before they approached the dam. Previous studies indicate that, on average, a fish has a 56% chance of passing the dam and, of those that pass the dam, only a 75% chance of being recaptured. Assume passing the dam and recapture are two independent processes.

##(a) Calculate the probability of the possible numbers of fish recaptured.

n <- 40       # number of marked fish released
pdam <- 0.56  # probability that a fish passes the dam
prec <- 0.75  # probability that a fish that passed the dam is recaptured
x <- 0:n      # every possible number of recaptured fish (0 to 40)
# Passing the dam and recapture are treated as independent, so one fish is
# recaptured with probability pdam * prec.
pall <- pdam * prec
pall
## [1] 0.42
# Binomial(n, pall) probability of each possible number of recaptured fish.
pfrec <- dbinom(x, n, pall)
pfrec
##  [1] 3.444449e-10 9.977024e-09 1.408825e-07 1.292232e-06 8.655728e-06
##  [6] 4.512918e-05 1.906319e-04 6.704983e-04 2.002825e-03 5.156698e-03
## [11] 1.157590e-02 2.286149e-02 4.000761e-02 6.239913e-02 8.714362e-02
## [16] 1.093803e-01 1.237600e-01 1.265214e-01 1.170686e-01 9.815918e-02
## [21] 7.463482e-02 5.147229e-02 3.219035e-02 1.824281e-02 9.357303e-03
## [26] 4.336626e-03 1.811720e-03 6.802628e-04 2.287090e-04 6.853113e-05
## [31] 1.819620e-05 4.250502e-06 8.656733e-07 1.519677e-07 2.265645e-08
## [36] 2.812524e-09 2.828688e-10 2.214444e-11 1.265971e-12 4.701217e-14
## [41] 8.510824e-16
# The 41 values are one probability per possible count, so the message says so.
cat("The probabilities of recapturing 0 to 40 fish are", pfrec)
## The probabilities of recapturing 0 to 40 fish are 3.444449e-10 9.977024e-09 1.408825e-07 1.292232e-06 8.655728e-06 4.512918e-05 0.0001906319 0.0006704983 0.002002825 0.005156698 0.0115759 0.02286149 0.04000761 0.06239913 0.08714362 0.1093803 0.12376 0.1265214 0.1170686 0.09815918 0.07463482 0.05147229 0.03219035 0.01824281 0.009357303 0.004336626 0.00181172 0.0006802628 0.000228709 6.853113e-05 1.81962e-05 4.250502e-06 8.656733e-07 1.519677e-07 2.265645e-08 2.812524e-09 2.828688e-10 2.214444e-11 1.265971e-12 4.701217e-14 8.510824e-16
# Bar chart of the distribution: the x-axis is the number of fish recaptured
# and the bar heights are probabilities. `names.arg` is spelled out in full
# rather than relying on partial matching of `names`.
barplot(
  pfrec,
  names.arg = as.character(x),
  xlab = "Number of fish recaptured",
  ylab = "Probability",
  main = "Probability of recapturing a certain number of fishes"
)

###The probability of recapturing a given number of fish peaks at 17 fish (about 0.127) and decreases for larger counts, so 17 is the most likely number of fish to be recaptured.

##(b) Calculate the mean and variance of the number of fish recaptured.

# Binomial mean: n * p.
meanrec <- n * pall
meanrec
## [1] 16.8
# Binomial variance: n * p * (1 - p), reusing the mean computed above.
varrec <- meanrec * (1 - pall)
varrec
## [1] 9.744
cat(
  "The mean of fish recaptured is ", round(meanrec, 3),
  "and", "the variance of fish recaptured is  ", varrec
)
## The mean of fish recaptured is  16.8 and the variance of fish recaptured is   9.744

#P5. The past data show that 1 out of 90 deaths is caused by motor vehicle accident in the US. 220 death cases are randomly sampled. Use the Poisson distribution to approximate the probability of the number of deaths caused by motor vehicle accident.

n_d <- 220     # number of sampled death cases
# Probability that a death is caused by a motor vehicle accident: 1 in 90.
# The earlier value 0.01 is 1 in 100 and understated lambda by about 10%.
p_d <- 1 / 90

#Poisson
lambda <- n_d * p_d # Poisson mean approximating Binomial(n_d, p_d)

##(a) Find the probability that none of the 220 died of motor vehicle accidents.

p_0 <- dpois(0, lambda) # P(X = 0)

cat("The probabilities that none of the 220 death cases died of motor vehicle accidents is",p_0)
## The probabilities that none of the 220 death cases died of motor vehicle accidents is 0.08677433

##(b) Find the probability that three or more are due to motor vehicle accidents.

# P(X >= 3) = P(X > 2); lower.tail = FALSE returns the upper tail directly.
p_3 <- ppois(2, lambda, lower.tail = FALSE)
p_3
## [1] 0.4418589
cat("The probability that three or more deaths are due to motor vehicle accidents is ",p_3)
## The probability that three or more deaths are due to motor vehicle accidents is  0.4418589

##(c) Find and plot the density function of the random variable X (the number of deaths of motor vehicles) using both Binomial and Poisson distributions.

# Binomial standard deviation, used only to pick a plotting range of about
# four standard deviations either side of the mean (floored at zero).
sd_d <- sqrt(n_d * p_d * (1 - p_d))
xmin <- max(round(lambda - 4 * sd_d), 0) # lower end of the range
xmax <- round(lambda + 4 * sd_d)         # upper end of the range
x_de <- xmin:xmax                        # counts at which both PMFs are evaluated


pd_binom <- dbinom(x = x_de, size = n_d, prob = p_d) # Binomial PMF
barplot(pd_binom, names.arg = as.character(x_de),
        xlab = "Number of deaths due to motor vehicles",
        ylab = "Probability", main = "Binomial Probability Distribution")

poi_den <- dpois(x_de, lambda) # Poisson PMF
poi_den
## (x_de is 0:9 with p_d = 1/90; values below are rounded to 4 decimals,
##  re-run to obtain the full-precision printout)
## 0.0868 0.2121 0.2593 0.2112 0.1291 0.0631 0.0257 0.0090 0.0027 0.0007
barplot(poi_den, names.arg = as.character(x_de),
        xlab = "Number of deaths due to motor vehicles",
        ylab = "Probability", main = "Probability Distribution (Poisson)")

# One row per distribution, so that beside = TRUE draws the bars in pairs.
pf_combined <- rbind(pd_binom, poi_den)
barplot(pf_combined, names.arg = as.character(x_de),
        col = c("lemonchiffon", "darkseagreen"), beside = TRUE,
        legend.text = c("Binomial", "Poisson"),
        xlab = "Number of deaths due to motor vehicles",
        ylab = "Probability", main = "Binomial vs. Poisson Probability Distribution")