# Data Analytics - Boston College Summer 2021 Thomas L.
# ========================================================

# 1. Probability of drawing a heart that is not a face card.
# Sample space: 52 cards. 13 hearts, 3 of them face cards -> 13 - 3 = 10.
round(10 / 52, digits = 4)
## [1] 0.1923

# 2. Probability that two six-sided dice sum to less than 6.
# There are 36 ordered outcomes; 10 of them have a sum below 6:
# {1,1}, {1,2}, {1,3}, {1,4}, {2,1}, {2,2}, {2,3}, {3,1}, {3,2}, {4,1}
round(10 / 36, digits = 4)
## [1] 0.2778
# Monte Carlo sanity check: simulate 100,000 rolls of two dice and take the
# share of rolls whose sum is below 6. The result is random (no seed is set).
Q_2 <- sample(1:6, size = 100000, replace = TRUE) +
  sample(1:6, size = 100000, replace = TRUE)
Q_2_answer <- round(sum(Q_2 < 6) / 100000, digits = 2)
Q_2_answer
## [1] 0.28
# Exact value again, for comparison with the simulation.
round(10 / 36, digits = 4)
## [1] 0.2778
# 3. P(male) across the whole residential customer base.
# Numerator: all males (column total). Denominator: everyone in the table.
# P(male) = 964 / 2001 = 0.4818

# Rows are residence types, columns are Male / Female counts.
Q3_mat <- matrix(
  c(233, 208,
    159, 138,
    102, 280,
    220, 265,
    250, 146),
  nrow = 5,
  byrow = TRUE,
  dimnames = list(
    c("Apartment", "Dorm", "With Parent(s)", "Sorority/Fraternity House", "Other"),
    c("Male", "Female")
  )
)
Q3_mat
##                           Male Female
## Apartment                  233    208
## Dorm                       159    138
## With Parent(s)             102    280
## Sorority/Fraternity House  220    265
## Other                      250    146
round(sum(Q3_mat[, "Male"]) / sum(Q3_mat), digits = 4)
## [1] 0.4818
# 4. P(Club) and P(Black) and P(Face card), multiplied as independent events.
# P(Club)      = 13 / 52
# P(Black)     = 26 / 52
# P(Face card) = 12 / 52  (3 face cards in each of the 4 suits)
round((13 / 52) * (26 / 52) * (12 / 52), digits = 4)
## [1] 0.0288
0.0288
## [1] 0.0288

# 5. A heart and a spade on two draws without replacement.
# P(heart) * P(spade | a heart has already been removed) = 13/52 * 13/51
round((13 / 52) * (13 / 51), digits = 4)
## [1] 0.0637
0.0637
## [1] 0.0637

# 6. A heart and then a red card on two draws without replacement.
# P(heart) * P(red | a heart has already been removed) = 13/52 * 25/51
round((13 / 52) * (25 / 51), digits = 4)
## [1] 0.1225
# 0.1225

# 7. P(Junior Female on first pick, Freshman Male on second), no replacement.
# Quick hand calculation: 4 junior females out of 85, then 12 freshman males
# out of the remaining 84.
4 / 85 * (12 / 84)
## [1] 0.006722689

# Same calculation driven by the class-by-gender count table.
Q7_mat <- matrix(
  c(12, 12,
    19, 15,
    12,  4,
     7,  4),
  nrow = 4,
  byrow = TRUE,
  dimnames = list(
    c("Freshman", "Sophomore", "Junior", "Senior"),
    c("Male", "Female")
  )
)
Q7_mat
##           Male Female
## Freshman    12     12
## Sophomore   19     15
## Junior      12      4
## Senior       7      4
# First factor: junior females over everyone; second factor: freshman males
# over everyone minus the one student already picked.
Q7_Prob <- (sum(Q7_mat["Junior", "Female"]) / sum(Q7_mat)) *
  (sum(Q7_mat["Freshman", "Male"]) / (sum(Q7_mat) - 1))
round(Q7_Prob, digits = 4)
## [1] 0.0067
# 8. Conditional probabilities from a sample of 300.
# (The variables keep their Q7_ prefix; they belong to question 8.)

# Step 1: P(Grad Degree | Male) = P(Grad and Male) / P(Male)
Q7_Grad_and_Male <- 52 / 300
Q7_Male <- 141 / 300
round(Q7_Grad_and_Male / Q7_Male, digits = 4)
## [1] 0.3688
# 0.3688

# Step 2: Bayes' rule,
# P(Male | Grad Degree) = P(Grad Degree | Male) * P(Male) / P(Grad Degree)
Q7_Grad <- 102 / 300
round((Q7_Grad_and_Male / Q7_Male) * Q7_Male / Q7_Grad, digits = 4)
## [1] 0.5098
# 0.5098

# 9. Value-meal combinations.
# Treating the selections as independent of each other, the number of meals is
# 6 drink options x 5 sandwich types x 3 chips.
prod(6, 5, 3)
## [1] 90
# 90

# 10. Orderings of patient visits.
# The number of ways to order all patients is the factorial of their count.
factorial(5)
## [1] 120
# The doctor can visit all 5 of her patients in 120 different orders.

# 11. Ordered song list.
# Permutations: n! / (n - k)!. The expression below is 8! / 3!, i.e. an
# ordered selection of 5 items out of 8.
# NOTE(review): if the list is 3 songs chosen from 8, this should be
# 8! / (8 - 3)! = 336 -- confirm against the problem statement.
factorial(8) / factorial(3)
## [1] 6720
# The coordinator has 6,720 ways in which he could order the song list.

# 12. Arrangements of 9 rolls containing three fours, five sixes and one two.
# Multinomial count: 9! divided by the factorial of each repeated group,
# because the order within a group of identical faces does not matter.
factorial(9) / prod(factorial(c(3, 5, 1)))
## [1] 504
# 504 possible ways to roll a die with those number combinations.

# 13. Unordered sample without replacement: choose k subjects from n.
# k = 6 toppings selected, n = 14 possible topping choices.
# C(n, k) = n! / [k! (n - k)!]. choose() evaluates the binomial coefficient
# directly, avoiding the huge intermediate factorials (which lose precision
# for large n and overflow to Inf beyond 170!).
n <- 14
k <- 6
Q13_combo <- choose(n, k)
Q13_combo
## [1] 3003
# 3003 possible topping combinations

# 14. Unordered sample without replacement: choose k subjects from n.
# k = 3 cards selected, n = 52 possible cards.
n <- 52
k <- 3
Q14_combo <- choose(n, k)
Q14_combo
## [1] 22100
# 22100 possible 3-card combinations

# 15. Home theatre systems.
# The TV, surround sound and DVD player are chosen independently, so the
# counts multiply: 12 TVs x 9 surround-sound systems x 5 DVD players.
prod(12, 9, 5)
## [1] 540
# 540 possible home theatre combinations

# 16. Passwords: ordered samples without replacement.
# Letters: 5 picked from the 26 letters A-Z.
# Digits: 3 picked from the 5 odd digits (1, 3, 5, 7, 9).
n_letters <- 26
k_letters <- 5
n_numbers <- 5
k_numbers <- 3
# Permutations: n! / (n - k)!
Q16_letters <- factorial(n_letters) / factorial(n_letters - k_letters)
Q16_numbers <- factorial(n_numbers) / factorial(n_numbers - k_numbers)
Q16_letters * Q16_numbers
## [1] 473616000
# 473,616,000 possible passwords




# 17. Ordered selection of 4 out of 9.
# Permutations: n! / (n - k)!  ->  9! / (9 - 4)! = 9! / 5!
factorial(9) / factorial(5)
## [1] 3024
# 3024

# 18. Combination: ordering does not matter.
# C(n, k) = n! / [k! (n - k)!], evaluated with choose() rather than a ratio of
# factorials (exact, and no overflow for large n).
n <- 11
k <- 8
Q_18_combo <- choose(n, k)
Q_18_combo
## [1] 165
# 165 possible combos


# 19. Ratio of a permutation count to a combination count.
# Permutation = n! / (n - k)!,        n = 12, k = 8
# Combination = n! / [k! (n - k)!],   n = 12, k = 4
n_P <- 12
k_P <- 8
Q19_P <- factorial(n_P) / factorial(n_P - k_P)
n_C <- 12
k_C <- 4
Q19_C <- choose(n_C, k_C)
Q19_P / Q19_C
## [1] 40320
# 40320 possibilities

# 20. Cabinet appointments: an ordered selection without replacement.
# n = 13 cabinet candidates, k = 7 cabinet spots.
# Permutations: n! / (n - k)!
Q20_n <- 13
Q20_k <- 7
Q20 <- factorial(Q20_n) / factorial(Q20_n - Q20_k)
Q20
## [1] 8648640
# 8,648,640 possible cabinet line-ups

# 21. Distinct orderings of the letters in "Population".
# The word has 10 letters (n). Two letters ("p" and "o") each appear twice, so
# the 10! orderings are divided by 2! for each repeated letter (same idea as
# Q12). The other 6 letters (u, l, a, t, i, n) appear once: factorial(1) = 1.
factorial(10) / (factorial(2) * factorial(2))
## [1] 907200
# 907200 possible orderings

# 22. Discrete random variable X with the probability table below.
# Step 1: mean = sum of each value weighted by its probability.
X_Q22 <- c(5, 6, 7, 8, 9)
PX_Q22 <- c(0.1, 0.2, 0.3, 0.2, 0.2)
mean_Q22 <- sum(X_Q22 * PX_Q22)
round(mean_Q22, digits = 1)
## [1] 7.2
# 7.2

# Step 2: variance = probability-weighted squared distance of each X value
# from the mean.
var_Q22 <- sum((X_Q22 - mean_Q22)^2 * PX_Q22)
round(var_Q22, digits = 1)
## [1] 1.6
# 1.6 variance

# Step 3: standard deviation = square root of the variance.
std_Q22 <- sqrt(var_Q22)
round(std_Q22, digits = 1)
## [1] 1.2
# 1.2

# Step 4: P(X >= 9)
# No values are greater than 9, and P(X = 9) = 0.2,
# so there is a 0.2 (20%) probability that X is 9 or greater.

# Step 5: P(X <= 7) = P(X = 7) + P(X = 6) + P(X = 5)
0.3+0.2+0.1
## [1] 0.6
# 0.6, or 60% probability that X is less than or equal to 7

# 23. Free-throw challenge.
# Probability of making one free throw = shots made / shots attempted.
FT_made <- 188 / 376
# Winning needs three makes in a row; the single-shot probability is cubed,
# which treats the three attempts as independent.
Prob_win <- FT_made^3
# Step 1: expected value per challenge = win 23 with Prob_win, lose 4 otherwise.
Q23_EV <- 23 * Prob_win + (-4 * (1 - Prob_win))
round(Q23_EV, digits = 2)
## [1] -0.62
# -$0.62 per challenge

# Step 2: expected value over 994 challenges = Q23_EV * 994
round(Q23_EV * 994, digits = 2)
## [1] -621.25
# $621.25 expected loss after 994 attempts

# 24. Coin-flip game.
# A binomial model fits: the flips are independent, the number of trials is
# fixed, and the probability of success is constant throughout.
# Step 1: winning probability = P(X <= 8) for X ~ Binomial(11, 0.5);
# the payoff is +1 on a win and -7 otherwise.
Q24_Prob <- pbinom(8, size = 11, prob = 0.5)
Q24_Prob
## [1] 0.9672852
Q24_EV <- 1 * Q24_Prob + (-7 * (1 - Q24_Prob))
round(Q24_EV, digits = 2)
## [1] 0.74
# 0.74 expected win per game of 11 coin flips

# Step 2: expected value of 615 games = Q24_EV * 615
round(Q24_EV * 615, digits = 2)
## [1] 454.04
# 454.04 expected winnings over 615 games

# 25. Two-card drawing game.
# Step 1
# Win 583 if both cards are clubs (two draws without replacement):
#   P(club first) * P(club second | club first) = 13/52 * 12/51
# Lose 35 in every other case.
Q25_Prob <- (13 / 52) * (12 / 51)
Q25_Prob
## [1] 0.05882353
Q25_EV <- (583 * Q25_Prob) + (-35 * (1 - Q25_Prob))
round(Q25_EV, digits = 2)
## [1] 1.35
# Expected win of $1.35 per drawing

# Step 2
# Expected value of 632 drawings
round(Q25_EV * 632, digits = 2)
## [1] 855.06
# $855.06 in expected winnings

# 26. P(X <= 2), where N = 10 and pi = 0.3 (binomial lower tail).
round(pbinom(2, size = 10, prob = 0.3), digits = 3)
## [1] 0.383
# 0.383, or 38.3%, that 2 or fewer light bulbs are defective in a sample of 10

# 27. Expected number of defective bulbs.
# The expected value of a binomial distribution is the number of trials times
# the chance of success: N = 5, pi = 0.3 -> EV = N * pi
Q27_EV <- 5 * 0.3
Q27_EV
## [1] 1.5
# 1.5 bulbs expected to be defective out of 5 samples

# 28. P(X > 5), where lambda = 5.5
# A Poisson model makes the most sense here because there is no specific N.
# ppois(q, lambda, lower.tail = FALSE) returns P(X > q) directly, so there is
# no need to sum the density up to an arbitrary large cutoff.
Q28_Prob <- ppois(5, lambda = 5.5, lower.tail = FALSE)
round(Q28_Prob, 4)
## [1] 0.4711
# 0.4711, or 47.11% that more than 5 special orders are sent on a given day

# 29. P(X > 4), where lambda = 5.7
# Poisson again (no specific N); the upper tail strictly above 4.
Q29_Prob <- ppois(4, lambda = 5.7, lower.tail = FALSE)
round(Q29_Prob, 4)
## [1] 0.6728
# 0.6728, or 67.28% that more than 4 customers will arrive at the drive-thru in a given hour

# 30. P(X <= 1), where N = 7 and pi = 0.4 (binomial lower tail).
Q30_Prob <- pbinom(1, size = 7, prob = 0.4, lower.tail = TRUE)
round(Q30_Prob, digits = 4)
## [1] 0.1586
# 0.1586, or 15.86% that the machine will crash in a given week

# 31. Hypergeometric: dependent trials, sampling without replacement.
# P(X > 1), where X = number of over-50 workers among those fired.
# phyper() arguments:
#   q = threshold for X (1)
#   m = over-50 employees in the population (6)
#   n = employees not over 50 (19)
#   k = employees fired (8)
X_Q31 <- 1
m_Q31 <- 6
n_Q31 <- 19
k_Q31 <- 8
# lower.tail = FALSE gives the upper tail P(X > q) rather than P(X <= q).
Prob_Q31 <- phyper(X_Q31, m_Q31, n_Q31, k_Q31, lower.tail = FALSE)
round(Prob_Q31, digits = 3)
## [1] 0.651
# 0.651, or 65.1% that more than 1 over-50 employee was fired

# 32. Hypergeometric again: dependent trials, sampling without replacement.
# P(X < 7), where X = number of patient deaths.
# q = 6, m = 10, n = 15, k = 8
# The threshold is 6, not 7: the lower tail P(X <= q) is inclusive, so using 7
# would also count the case of exactly 7 deaths.
X_Q32 <- 6
m_Q32 <- 10
n_Q32 <- 15
k_Q32 <- 8
Prob_Q32 <- phyper(X_Q32, m_Q32, n_Q32, k_Q32)
round(Prob_Q32, digits = 3)
## [1] 0.998
# 0.998, or 99.8% probability that there will be fewer than 7 deaths

# 33. P(X > 979), where mu = 1300 and the variance is 40000.
SD_Q33 <- sqrt(40000)
# Upper tail = 1 - lower tail.
Prob_Q33 <- 1 - pnorm(979, mean = 1300, sd = SD_Q33)
round(Prob_Q33, digits = 4)
## [1] 0.9458
# 0.9458, or 94.58% probability that a randomly sampled steer weighs more than 979 lbs

# 34. P(X > 8340), where mu = 11000 and the variance is 1960000.
SD_Q34 <- sqrt(1960000)
# Upper tail = 1 - lower tail.
Prob_Q34 <- 1 - pnorm(8340, mean = 11000, sd = SD_Q34)
round(Prob_Q34, digits = 4)
## [1] 0.9713
# 0.9713, or 97.13% probability that a randomly selected monitor has a lifespan greater than 8340 hrs

# 35. P(83m < X < 85m), where mu = 80m and sd = 3m.
# Probability of an interval = CDF at the high end minus CDF at the low end.
Prob_Q35 <- pnorm(85, mean = 80, sd = 3) - pnorm(83, mean = 80, sd = 3)
round(Prob_Q35, digits = 4)
## [1] 0.1109
# 0.1109, or 11.09% of randomly selected firm incomes fall between 83m and 85m

# 36. Cut-off score for the top 14%, where mu = 456 and sd = 123.
# lower.tail = FALSE makes qnorm() return the value with 14% of the
# distribution above it.
X_Q36 <- qnorm(0.14, mean = 456, sd = 123, lower.tail = FALSE)
round(X_Q36, digits = 0)
## [1] 589
# Only students who score higher than 589 will be eligible for tutoring roles

# 37. Rejection limits for nail length: the bottom 7% and the top 7%.
# mu = 6.13, sd = 0.06
Lower_Q37 <- qnorm(0.07, mean = 6.13, sd = 0.06)
Upper_Q37 <- qnorm(0.07, mean = 6.13, sd = 0.06, lower.tail = FALSE)
round(Lower_Q37, digits = 2)
## [1] 6.04
round(Upper_Q37, digits = 2)
## [1] 6.22
# Nails smaller than 6.04cm or larger than 6.22cm should be rejected

# 38. Score limits for a C grade.
# mu = 78.8, sd = 9.8
# A C spans the 20th percentile up to the 55th percentile (i.e. below the
# top 45%). For reference, the mean sits at the 50% mark.
Upper_C_Q38 <- qnorm(0.55, mean = 78.8, sd = 9.8)
Lower_C_Q38 <- qnorm(0.20, mean = 78.8, sd = 9.8)
round(Upper_C_Q38, digits = 0)
## [1] 80
round(Lower_C_Q38, digits = 0)
## [1] 71
# The numerical limits for a C in the English professor's class are 71 and 80

# 39. Minimum score that lands in the top 45%.
# mu = 21.2, sd = 5.4
# lower.tail = FALSE: the returned score has 45% of the distribution above it.
Score_Q39 <- qnorm(0.45, mean = 21.2, sd = 5.4, lower.tail = FALSE)
round(Score_Q39, digits = 1)
## [1] 21.9
# Students who score 21.9 or above on the ACT written composite test meet the minimum admission threshold

# 40. P(X < 11 | N = 151, pi = 0.09)
# "Fewer than 11" is P(X <= 10) on the binomial lower tail.
Prob_Q40 <- pbinom(10, size = 151, prob = 0.09)
round(Prob_Q40, digits = 4)
## [1] 0.192
# 0.192, or 19.2% probability that fewer than 11 students will not graduate on time

# 41. Sampling distribution of the mean (central limit theorem).
# P(Xbar > 48.83), where N = 147, mu = 48, sd = 7
# Standard error = sd / sqrt(N) = 7 / sqrt(147)
SE_Q41 <- 7 / sqrt(147)
Prob_Q41 <- pnorm(48.83, mean = 48, sd = SE_Q41, lower.tail = FALSE)
round(Prob_Q41, digits = 4)
## [1] 0.0753
# 0.0753, or 7.53% is the probability that the sample mean would be greater than 48.83

# 42. P(Xbar > 93.54 months), where mu = 91 months, N = 68, sd = 10
SE_Q42 <- 10 / sqrt(68)
Prob_Q42 <- pnorm(93.54, mean = 91, sd = SE_Q42, lower.tail = FALSE)
round(Prob_Q42, digits = 4)
## [1] 0.0181
# 0.0181, or 1.81% of samples would have a mean life greater than 93.54 months

# 43. P(4% <= sample proportion of no-shows <= 10%)
# Population no-show proportion = 7%, sample size N = 540.
N_Q43 <- 540
Prop_Q43 <- 0.07
# Standard error of a sample proportion = sqrt(p * (1 - p) / N).
# The division by N belongs INSIDE the square root; dividing sqrt(p * (1 - p))
# by N makes the spread far too small and pushes the answer to 1.
std_Q43 <- sqrt(Prop_Q43 * (1 - Prop_Q43) / N_Q43)
# Probability of the interval = CDF at 10% minus CDF at 4%.
Upper_Q43 <- pnorm(0.10, mean = Prop_Q43, sd = std_Q43)
Lower_Q43 <- pnorm(0.04, mean = Prop_Q43, sd = std_Q43)
round(Upper_Q43 - Lower_Q43, 4)
## [1] 0.9937
# 0.9937, or 99.37% probability that the no-show share is between 4% and 10%

# 44. P(19% <= sample proportion of defective bottles <= 27%)
# Population defective proportion = 23%, sample size N = 602.
N_Q44 <- 602
Prop_Q44 <- 0.23
# Same standard-error formula. The complement of 0.23 is 0.77, and both tails
# must be evaluated around the same mean of 0.23.
std_Q44 <- sqrt(Prop_Q44 * (1 - Prop_Q44) / N_Q44)
Upper_Q44 <- pnorm(0.27, mean = Prop_Q44, sd = std_Q44)
Lower_Q44 <- pnorm(0.19, mean = Prop_Q44, sd = std_Q44)
round(Upper_Q44 - Lower_Q44, 4)
## [1] 0.9803
# 0.9803, or 98.03% probability that the defective share is between 19% and 27%
# 45. 80% confidence interval for the mean (z interval).
# mu = xbar +/- Z * SE, with SE = sd / sqrt(n)
# xbar = 3.9, sd = 0.8, N = 208
# An 80% two-tailed interval leaves 10% in each tail, hence qnorm(0.9).
xbar <- 3.9
SE_Q45 <- 0.8 / sqrt(208)
Upper_Q45 <- xbar + qnorm(0.9) * SE_Q45
Lower_Q45 <- xbar - qnorm(0.9) * SE_Q45
round(Upper_Q45, digits = 1)
## [1] 4
round(Lower_Q45, digits = 1)
## [1] 3.8
# Lower bound = 3.8
# Upper bound = 4.0

# 46. 98% confidence interval for the mean (z interval).
# mu = xbar +/- Z * SE, with SE = sd / sqrt(n)
# xbar = 16.6, sd = 11, N = 7472
# A 98% two-tailed interval leaves 1% in each tail, hence qnorm(0.99).
xbar <- 16.6
SE_Q46 <- 11 / sqrt(7472)
Upper_Q46 <- xbar + qnorm(0.99) * SE_Q46
Lower_Q46 <- xbar - qnorm(0.99) * SE_Q46
round(Upper_Q46, digits = 1)
## [1] 16.9
round(Lower_Q46, digits = 1)
## [1] 16.3
# Lower bound = 16.3 per capita income
# Upper bound = 16.9 per capita income


# 47. Critical t value.
# Step 1 - the top right picture best represents the value of t such that
# 0.05 of the area is under the curve: one tail, to the left.
# Step 2 - degrees of freedom = sample size - 1.
df_Q47 <- 26 - 1
x_Q47 <- qt(0.05, df = df_Q47)
round(x_Q47, digits = 4)
## [1] -1.7081
# -1.7081

# 48. Confidence interval from a small sample of gas readings.
helium_gas_Q48 <- c(383.6, 347.1, 371.9, 347.6, 325.8, 337)
# Step 1: sample mean
xbar_Q48 <- mean(helium_gas_Q48)
round(xbar_Q48, digits = 2)
## [1] 352.17
# 352.17 picocuries per liter

# Step 2: sample standard deviation
sd_Q48 <- sd(helium_gas_Q48)
round(sd_Q48, digits = 2)
## [1] 21.68
# 21.68

# Step 3: critical value
# The population standard deviation is unknown, so a t critical value is used.
# Two tails at 90% confidence: (1 - 0.9) / 2 = 0.05 in each tail.
df_Q48 <- 6 - 1
T_Q48 <- abs(qt(0.05, df = df_Q48))
round(T_Q48, digits = 3)
## [1] 2.015
# 2.015

# Step 4: interval for mu = xbar +/- t value * SE
SE_Q48 <- sd_Q48 / sqrt(6)
Upper_Q48 <- xbar_Q48 + T_Q48 * SE_Q48
Lower_Q48 <- xbar_Q48 - T_Q48 * SE_Q48
round(Upper_Q48, digits = 3)
## [1] 369.998
round(Lower_Q48, digits = 3)
## [1] 334.335
# Lower bound = 334.335
# Upper bound = 369.998

# 49. 80% confidence interval for mean yield.
# Step 1: critical value
# xbar = 46.4 bushels/acre, N = 16, sample sd = 2.45
# The population standard deviation is unknown, so a t critical value is used.
# Two tails at 80% confidence: (1 - 0.8) / 2 = 0.1 in each tail.
df_Q49 <- 16 - 1
T_Q49 <- abs(qt(0.1, df = df_Q49))
round(T_Q49, digits = 3)
## [1] 1.341
# 1.341

# Step 2: interval for mu = xbar +/- t value * SE
SE_Q49 <- 2.45 / sqrt(16)
xbar_Q49 <- 46.4
Upper_Q49 <- xbar_Q49 + T_Q49 * SE_Q49
Lower_Q49 <- xbar_Q49 - T_Q49 * SE_Q49
round(Upper_Q49, digits = 1)
## [1] 47.2
round(Lower_Q49, digits = 1)
## [1] 45.6
# Lower bound = 45.6
# Upper bound = 47.2

# 50. Minimum sample size N so that the margin of error is at most 0.13
# (mu = 8, population sd = 1.9, 99% confidence).
std_Q50 <- 1.9
# The population standard deviation is known, so Z is the critical value.
# Two-tailed 99% -> 0.01 / 2 = 0.005 in each tail. The sign of Z does not
# matter below because it is squared.
Z_Q50 <- qnorm(0.005)
# Margin of error: ME = Z * sd / sqrt(N)
ME_Q50 <- 0.13
# ME^2 = Z^2 * sd^2 / N   ->   N = (Z^2 * sd^2) / ME^2
# Round UP: a sample size must be a whole number, and rounding to the nearest
# integer can land just below the size needed to meet the margin of error.
N_Q50 <- ceiling((Z_Q50^2 * std_Q50^2) / ME_Q50^2)
N_Q50
## [1] 1418
# 1418 toys manufactured at a minimum

# 51. Minimum sample size N for a margin of error of 0.19
# (mu = 12.6, population variance = 3.61, 95% confidence).
mu_Q51 <- 12.6
std_Q51 <- sqrt(3.61)
# The population standard deviation is known, so Z is the critical value.
# Two-tailed 95% -> 0.05 / 2 = 0.025 in each tail.
Z_Q51 <- qnorm((1 - 0.95) / 2)
ME_Q51 <- 0.19
# Same formula and same round-up rule as Q50.
N_Q51 <- ceiling((Z_Q51^2 * std_Q51^2) / ME_Q51^2)
N_Q51
## [1] 385
# 385 samples of bacteria strands reproducing would be required for a 95%
# confidence interval at the given margin of error

# 52. Proportion reading at or below the 8th-grade level.
# Step 1: point estimate
sample_Q52 <- 2089
below_level_Q52 <- sample_Q52 - 1734
Proportion_below_Q52 <- below_level_Q52 / sample_Q52
round(Proportion_below_Q52, 3)
## [1] 0.17
# 0.17, or 17%, read at or below the 8th-grade level

# Step 2: confidence interval = point estimate +/- Z * SE
# Critical value with 1% in each tail (a 98% two-sided interval).
# lower.tail = FALSE returns the positive quantile so that "+" really yields
# the upper bound and "-" the lower bound.
Z_Q52 <- qnorm(0.01, lower.tail = FALSE)
# SE of a sample proportion = sqrt(p * (1 - p) / n); the division by n belongs
# inside the square root.
SE_Q52 <- sqrt(Proportion_below_Q52 * (1 - Proportion_below_Q52) / sample_Q52)
upper_Q52 <- Proportion_below_Q52 + Z_Q52 * SE_Q52
lower_Q52 <- Proportion_below_Q52 - Z_Q52 * SE_Q52
round(upper_Q52, 3)
## [1] 0.189
round(lower_Q52, 3)
## [1] 0.151
# lower bound = 0.151
# upper bound = 0.189

# 53. Proportion of sampled tankers with spills.
# Step 1: point estimate
sample_Q53 <- 474
spills_Q53 <- 156
Proportion_spills_Q53 <- spills_Q53 / sample_Q53
round(Proportion_spills_Q53, 3)
## [1] 0.329
# 0.329, or 32.9%, of sampled tankers had spills

# Step 2: confidence interval = point estimate +/- Z * SE
# SE of a sample proportion = sqrt(p * (1 - p) / n)
SE_Q53 <- sqrt(Proportion_spills_Q53 * (1 - Proportion_spills_Q53) / sample_Q53)
# 2.5% in each tail (a 95% two-sided interval); positive critical value.
upper_Q53 <- Proportion_spills_Q53 + qnorm(0.025, lower.tail = FALSE) * SE_Q53
lower_Q53 <- Proportion_spills_Q53 - qnorm(0.025, lower.tail = FALSE) * SE_Q53
round(upper_Q53, 3)
## [1] 0.371
round(lower_Q53, 3)
## [1] 0.287
# lower bound = 0.287
# upper bound = 0.371