# Question 10 (worked answer): probability that a random sample of 55 adults
# contains 10 or fewer smokers when 21% of the population smokes.
n_people <- 55
p_smoke <- 0.21

# a. Exact binomial probability P(X <= 10): sum the point probabilities.
prob <- dbinom(0:10, n_people, p_smoke)
sum(prob)  # same value as pbinom(10, n_people, p_smoke)

# b. Normal approximation of the binomial.
# mean = n * p; norm_d holds the variance n * p * (1 - p), so the standard
# deviation passed to pnorm() is sqrt(norm_d). 10.5 (not 10) applies the
# continuity correction for approximating a discrete count.
means <- n_people * p_smoke
norm_d <- n_people * p_smoke * (1 - p_smoke)
pnorm(10.5, means, sqrt(norm_d))

      # Full binomial distribution of the number of smokers, for reference.
      probs <- dbinom(0:n_people, n_people, p_smoke)
      barplot(probs, names.arg = 0:n_people,
              main = "Number of Smokers in a Sample of 55",
              xlab = "Number of smokers", ylab = "Probability",
              col = "steelblue")

#Math 410

#Sample Final Exam

#1. Using the data on pulse rate in Digital Appendix 3 ('Student Data'), construct a separate histogram for the pulse rate of females and males. Be sure you use the same scale for both histograms. Are the distributions similar? If not, how do they differ? Also, create two box plots which compare the male and female data. What information can you infer from the box plots?

      # Question 1: compare the pulse rates of female and male students with
      # histograms drawn on a shared scale and with box plots.
      # file.choose() opens an interactive file picker for the CSV to read.
      student_data <- read.csv(file.choose())

      # Split the rows on the Gender column ("male" / "female").
      male <- subset(student_data, Gender == "male")
      female <- subset(student_data, Gender == "female")

      # histogram
      # Both histograms use the same xlim/ylim so they can be compared directly.
      hist(female$Pulse, breaks = 20, main = "Female Pulse Rate",
           xlab = "Pulse Rate (bpm)", col = "blue",
           xlim = c(20, 120), ylim = c(0, 20))
      hist(male$Pulse, breaks = 20, main = "Male Pulse Rate",
           xlab = "Pulse Rate (bpm)", col = "green",
           xlim = c(20, 120), ylim = c(0, 20))
      # The female pulse rates seem relatively symmetrical and show a peak around
      # 80, while the male pulse rates seem bimodal. The female pulse rates seem
      # to be higher than the males'.

      ## boxplot
      # Colours match the histograms: female = blue, male = green.
      boxplot(female$Pulse, col = "blue",
              ylab = "Pulse Rate (bpm)",
              main = "Female Pulse Rates")

      boxplot(male$Pulse, col = "green",
              ylab = "Pulse Rate (bpm)",
              main = "Male Pulse Rates")

      # The labels must follow the order of the data arguments (female first,
      # then male); they were previously swapped.
      boxplot(female$Pulse, male$Pulse, col = c("blue", "green"),
              names = c("Female", "Male"), ylab = "Pulse Rate (bpm)",
              main = "Comparison of Pulse Rates by Gender")
      # NOTE(review): the earlier reading ("males have a higher median and more
      # outliers than females") was taken from a plot whose group labels were
      # swapped, and it disagrees with the histogram comment above. Re-read the
      # medians and outliers from the corrected plot before stating a conclusion.

#2. The file named 'bass.csv' contains the lengths (in millimeters) of a sample of 100 largemouth bass.

#Construct a frequency distribution table and histogram from the data using an appropriate bin size. #Ensure both the table and graph are displayed in accordance with the rules described in Chapter 3.

      # Question 2: frequency distribution table and histogram of bass lengths.
      # file.choose() opens an interactive file picker for the CSV to read.
      freq_data <- read.csv(file.choose())
      #freq_data
      bass_length <- freq_data$length.mm.
      #bass_length

      # Inspect the data range to confirm the class limits below span it.
      min(bass_length)
      max(bass_length)

      # Class limits: 50 mm wide classes from 0 to 500 mm.
      bins <- seq(0, 500, by = 50)
      bins

      # Assign each length to a class and tabulate. The results are stored under
      # their own names so the base functions cut() and table() are not masked.
      length_class <- cut(bass_length, bins)
      freq_table <- transform(table(length_class))
      freq_table

      # histogram
      # Use the same class limits as the table so the two displays agree
      # (a bare `breaks = 10` is only a suggestion and may pick other limits).
      hist(bass_length, breaks = bins, main = "Bass Lengths(mm)",
           xlab = "Length (mm)", ylab = "Frequency", col = "violet")

#3. Expand the frequency distribution table generated in (2) above to include a
# new column for cumulative frequencies. Display the cumulative frequencies in
# both histogram and frequency polygon format using the R command freq.pol.

# Rebuild the class counts from bass_length and bins (both created in
# question 2), then accumulate them.
class_counts <- table(cut(bass_length, bins))
cum_sum <- cumsum(class_counts)
cum_sum

# Frequency table expanded with the cumulative-frequency column.
cum_table <- data.frame(
  Class = names(class_counts),
  Freq = as.vector(class_counts),
  CumFreq = as.vector(cum_sum)
)
cum_table

x <- seq_along(cum_sum) # or use names(cum_sum) for the actual bin labels

barplot(cum_sum,
        names.arg = x, main = "Bass Lengths (mm)", xlab = "Length (mm)",
        ylab = "Cumulative Frequency", col = "springgreen4")

# R is case-sensitive: the loader is library(), not Library().
library(LearningStats)
# NOTE(review): confirm the exact function name and arguments against the
# LearningStats documentation (e.g. whether a cumulative option must be set
# to obtain the cumulative polygon).
freq.pol(bass_length, main = "Cumulative Frequency Polygon")

#4. A random sample of 42 belted kingfishers was collected from various
# locations in North America and their bill lengths were measured in mm. The
# collected data is contained in the file labeled 'kingfisher.csv'. Display a
# boxplot of the data. Compute the following statistics: mean, median, mode,
# range, interquartile range, 5th percentile, 95th percentile, variance,
# standard deviation, and coefficient of variation.

# file.choose() opens an interactive file picker for the CSV to read.
kingfisher <- read.csv(file.choose())
bill_length <- kingfisher$Bill_length

boxplot(bill_length, col = "steelblue", ylab = "Bill Length (mm)",
        main = "Kingfisher Boxplot")

mean(bill_length)

median(bill_length)

sd(bill_length)

var(bill_length)

IQR(bill_length)

# mode: the value(s) with the highest count in the frequency table
# (recorded from an earlier run: 56.5 and 58.5, each repeated 3 times)
tablekingfish <- table(bill_length)
tablekingfish
bill_modes <- as.numeric(names(tablekingfish)[tablekingfish == max(tablekingfish)])
bill_modes

# range: computed as max - min rather than typed in by hand
# (recorded from an earlier run: 66 - 41.6 = 24.4)
range(bill_length)
diff(range(bill_length))

# 5th and 95th percentile
quantile(bill_length, probs = c(0.05, 0.95))

# coefficient of variation, as a percentage of the mean
CV <- (sd(bill_length) / mean(bill_length)) * 100
CV

#5. Select five simple random samples of 25, 50, and 100 male mosquitofish lengths from the data in #the Mosquitofish.csv file. Compute the mean and standard deviation for these samples. Consider #the measurements in the file to be the entire population of interest with a population mean of 28.8 #mm and a population standard deviation of 6.84 mm. Are the means and standard deviations of #your samples the same as the population mean and standard deviation? How do you account for #any difference? What trends are you seeing?

      # Question 5: draw five simple random samples each of size 25, 50 and 100
      # from the male mosquitofish lengths and compare their means and standard
      # deviations with the population values (mean 28.8 mm, sd 6.84 mm).
      # file.choose() opens an interactive file picker for the CSV to read.
      length_data <- read.csv(file.choose())
      male_lengths <- length_data$FishLength[length_data$Gender == "M"]
      male_lengths

      # Draw `n_samples` simple random samples (without replacement) of `size`
      # observations from `values`. Returns a data frame with one row per
      # sample holding that sample's mean and standard deviation.
      sample_stats <- function(values, size, n_samples = 5) {
        stopifnot(size <= length(values))
        draws <- lapply(
          seq_len(n_samples),
          function(i) sample(values, size, replace = FALSE)
        )
        data.frame(
          sample = seq_len(n_samples),
          mean = vapply(draws, mean, numeric(1)),
          sd = vapply(draws, sd, numeric(1))
        )
      }

      # sample of 25: per-sample statistics, then their averages
      stats_25 <- sample_stats(male_lengths, 25)
      stats_25
      avg_mean_25 <- mean(stats_25$mean)
      avg_sd_25 <- mean(stats_25$sd)

      avg_mean_25
      avg_sd_25

      # sample 50
      stats_50 <- sample_stats(male_lengths, 50)
      stats_50
      avg_mean_50 <- mean(stats_50$mean)
      avg_sd_50 <- mean(stats_50$sd)

      avg_mean_50
      avg_sd_50

      # sample 100
      stats_100 <- sample_stats(male_lengths, 100)
      stats_100
      avg_mean_100 <- mean(stats_100$mean)
      avg_sd_100 <- mean(stats_100$sd)

      avg_mean_100
      avg_sd_100

      # Recorded from an earlier run: the standard deviation and sample means
      # are much lower than the population mean and standard deviation, likely
      # because we are sampling only from males and not females as well. The
      # male fish likely have smaller average lengths than females and thus
      # differ in the mean and sd.
      # NOTE(review): the samples are random, so the numbers change on every
      # run; re-check this conclusion against the current output, and compare
      # how much the per-sample means vary at n = 25 versus n = 100 to answer
      # the "what trends are you seeing" part.

#6. Autism Spectrum Disorders (ASD) are a group of developmental disorders that affect a person's

#ability to interact socially. According to Christensen et. al. (2016), about 1 in every 68 children #born in the United States has ASD at the age of eight years. Suppose families consist of three #children. We will assume that the occurrence of children with ASD are independent events within #each family. #a. Determine the binomial distribution for 𝑥 = 0, 1, 2, and 3 children with ASD and display #the results in both tabular and graphical format. #b. Next, determine the probability that two or more children in a family of four have ASD. #c. Finally, imagine that a small city has 12,000 children. What is the expected number of #children with ASD?

      # Question 6: binomial model for the number of children with ASD.
      #a
      # pvalue is the per-child probability of ASD (1 in 68), not a test p-value.
      pvalue <- 1.0 / 68
      pvalue
      ntrials <- 3 # children per family in part a

      # P(X = 0), ..., P(X = 3) shown as a table; the probabilities must sum to 1.
      x <- 0:ntrials
      prob <- dbinom(x, ntrials, pvalue)
      asd_table <- data.frame(children_with_asd = x, probability = prob)
      asd_table
      sum(prob)

      barplot(prob, names.arg = x, main = "ASD Probability Distribution",
              xlab = "Number of People", col = "sienna")

      #b
      # The question asks about a family of FOUR here, so part a's ntrials = 3
      # must not be reused. P(X >= 2) = 1 - P(X <= 1).
      family_size_b <- 4
      prob_two_plus <- pbinom(1, family_size_b, pvalue, lower.tail = FALSE)
      prob_two_plus

      #c
      # Expected count = n * p for the 12,000 children in the city.
      expval <- pvalue * 12000
      expval

#7. The table below shows the number of maple seedlings that were present in 100 square meter

#quadrats and their associated frequencies. Using the appropriate Chi-Square test, determine #whether or not the seedlings are randomly distributed in the sampled habitat. Recall that when items #are randomly distributed, they tend to follow a Poisson distribution. Be sure the assumption of the #test you select are met. #Number of Plants / Quadrat Frequency #H0: seedlings follow poisson distribution (are randomly distributed) #HA: seedlings don’t follow poisson distribution (are not randomly distributed)

      # Question 7: chi-square goodness-of-fit test of the seedling counts
      # against a Poisson distribution.
      # assumptions
      # goodness of fit;
      # - data are freq in mutually exclusive categories
      # - objects are independent
      # - each category has large enough expected frequency

      plants <- c(0, 1, 2, 3, 4, 5, 6)
      freqs <- c(35, 28, 15, 10, 7, 5, 0)
      n_quadrats <- sum(freqs)

      # Poisson mean estimated from the data: average number of plants per
      # quadrat (141 plants / 100 quadrats = 1.41).
      lambda <- sum(plants * freqs) / n_quadrats
      lambda

      probs <- dpois(0:5, lambda) # larger and then taper off
      probs
      sum(probs)
      # The last class is "6 or more", so it takes the remaining tail
      # probability and the class probabilities sum to exactly 1.
      probs[7] <- 1 - sum(probs)
      probs
      sum(probs)
      expected <- probs * n_quadrats
      expected

      # Pool the classes 4, 5 and 6+ so the pooled expected frequency is large
      # enough for the chi-square approximation.
      pooled <- 5:7
      observed <- c(freqs[1:4], sum(freqs[pooled])) # 7+5+0 = 12
      observed

      expected_combined <- c(expected[1:4], sum(expected[pooled]))
      expected_combined

      chisq_stat <- sum((observed - expected_combined)^2 / expected_combined)
      # df = classes - 1 - (number of estimated parameters); lambda was
      # estimated from the data, so one extra degree of freedom is lost.
      df <- length(observed) - 1 - 1
      p_value <- pchisq(chisq_stat, df, lower.tail = FALSE)

      chisq_stat
      p_value


      #p-value < alpha of 0.05
      #reject H0
      # conclusion = seedlings don't follow poisson and are not randomly distributed

#8. One of the largest health experiments ever conducted investigated the
# effectiveness of the Salk vaccine in preventing paralysis and death from
# poliomyelitis. Test the null hypothesis that the treatment and rate of polio
# are independent using the appropriate Chi-Square test. Be sure the
# assumptions of the test you select are met.
#
# Treatment          Number with Polio   Number without Polio
# Salk Vaccine                      57                200,688
# Placebo Control                  142                201,087
#
# Test of association
# assumptions
# - Data are frequencies placed into mutually exclusive cells.
# - Observations are independent of one another.
# - Not more than 20% of the cells may have an expected value of < 5.00, and
#   no cell may have an expected value < 1.00.
# - For a 2 x 2 contingency table, all cells must have an expected value of
#   5.00 or greater.
# Might have to look at Fisher's exact test.

      # Chi-square test of association between treatment and polio outcome.
      # H0: the vaccine has no effect on the rate of polio
      # HA: the vaccine does have an effect on the rate of polio

      # One vector per treatment row: (number with polio, number without polio).
      R1 <- c(57, 200688)   # Salk vaccine
      R2 <- c(142, 201087)  # placebo control

      # Stack the two rows into a plain 2 x 2 matrix of counts (no dimnames).
      polio <- unname(rbind(R1, R2))
      chisq.test(polio, correct = FALSE)

      # The p-value is below alpha = 0.05, so H0 is rejected.
      # Conclusion: the vaccine does have an effect on the rate of polio.

#9. Using the data in the eggs.csv file, determine if the eggs in ounces tend to follow a normal distribution. #a. Construct a histogram and compare its shape to a normal curve. #b. Create a quantile plot. Does the plotted data deviate from a straight line? # c. Compute the values of skewness and kurtosis. What are the expected values of skewness and #kurtosis for a normal distribution? Are your computed values close to the expectation? # d. Finally, run a Shapiro-Wilk normality test. #e. Can you conclude that this sample follows a normal distribution?

# Question 9: assess whether the egg sizes (in ounces) follow a normal
# distribution. file.choose() opens an interactive file picker for the CSV.
eggs <- read.csv(file.choose())
#eggs
eggdata <- eggs$Egg.Sizes

# a. Histogram, to compare its shape with a normal curve.
hist(eggdata, col = "red", main = "Histogram of Egg Sizes (oz)",
     xlab = "Egg Size (oz)")

# b. Quantile plot with its reference line.
qqnorm(eggdata)
qqline(eggdata)
# it deviates slightly at the tail but not in the middle

# c. Skewness and kurtosis.
library(moments)

skewness(eggdata) # expected value for a normal distribution: 0
# NOTE(review): moments::kurtosis() is understood to report Pearson's
# (non-excess) kurtosis, for which a normal distribution gives 3 -- confirm
# against the package documentation.
kurtosis(eggdata)

# d. Shapiro-Wilk normality test.
#H0: The egg sizes are normally distributed
#HA: The egg sizes are not normally distributed
shapiro_result <- shapiro.test(eggdata)
shapiro_result

# e. Decision at alpha = 0.05, taken from the computed p-value.
# (Recorded from an earlier run: p-value << 0.05, reject the null hypothesis
# of normality; the egg sizes are not normally distributed.)
if (shapiro_result$p.value < 0.05) {
  cat("Reject H0: the egg sizes are not normally distributed\n")
} else {
  cat("Fail to reject H0: no evidence against normality of the egg sizes\n")
}

#10. According to the National Center for Health Statistics for 2015, 21% of adults in Pennsylvania

#smoke. We wish to know the probability of selecting a random sample of 55 people containing 10 #or fewer smokers from this population. #a. Use a statistical program to determine the exact probability of getting 10 or fewer smokers. #b. Now use the normal approximation of the binomial distribution to answer the same question. #Compare your answers. Are they similar?

#11. Suppose that in a sample of 85 mature Redwood trees, the average height is 225 meters with a

#standard deviation of 15 meters. Find a 95% confidence interval for the
# average height of mature Redwoods. For the critical t value, first estimate
# it from the table in the text (Appendix A.2) using interpolation. Then use R
# to get the exact value. How close are they?

# Exact critical t value; df is 84, which is the sample size minus 1.
t_crit <- qt(.05 / 2, 84, lower.tail = FALSE)
t_crit

# Table values: 60 df, tcrit = 2.0 and 100 df, tcrit = 1.984.
# Linear interpolation between them for 84 df.
t_table <- 2.000 + (84 - 60) / (100 - 60) * (1.984 - 2.000)
t_table
t_table - t_crit # how close the table estimate is to the exact value

se <- 15 / sqrt(85) # standard error
se

upper <- 225 + t_crit * se # sample mean plus t value times se
upper

lower <- 225 - t_crit * se # sample mean minus t value times se
lower
# the mean would fall between these 2 limits

#12. A random sample of 24 tweetie birds was collected and the tail length of each bird was measured.

#The mean length 𝑥 was 50.0 mm and the standard deviation 𝑠 was 10.0 mm. Compute the 95% #confidence interval for the mean tail length 𝜇. Next compute the 90% confidence interval. Interpret #what these confidence intervals represent.

# Confidence intervals for the mean tail length (n = 24, mean 50.0 mm,
# sd 10.0 mm), assuming a t distribution.

# Two-sided t confidence interval from summary statistics.
#   xbar  sample mean
#   s     sample standard deviation
#   n     sample size
#   level confidence level (default 0.95)
# Returns a named vector: critical t value, standard error, lower and upper
# limits.
t_conf_int <- function(xbar, s, n, level = 0.95) {
  alpha <- 1 - level
  t_crit <- qt(alpha / 2, n - 1, lower.tail = FALSE) # critical t value
  se <- s / sqrt(n)                                  # standard error
  c(
    t_crit = t_crit,
    se = se,
    lower = xbar - t_crit * se, # sample mean minus t value times se
    upper = xbar + t_crit * se  # sample mean plus t value times se
  )
}

# 95% confidence interval (appendix value of t = 2.069)
ci_95 <- t_conf_int(50, 10, 24, level = 0.95)
ci_95

#90% confidence interval (appendix value of t = 1.714)
ci_90 <- t_conf_int(50, 10, 24, level = 0.90)
ci_90

# Interpretation: if the sampling were repeated many times, about 95% (or
# 90%) of intervals built this way would contain the true mean tail length.
# The 90% interval is narrower because it accepts a lower confidence level.

#13. The label on a company’s energy drinks claims that they contain an average caffeine concentration #of 255 mg/oz. The mean concentration of 22 randomly sampled drinks was 271, with a standard #deviation of 12.1. Are the drink labels accurate?

#H0: mean caffiene concentration is equal to 255 mg/oz #HA: mean caffiene concentration is NOT equal to 255 mg/oz

# assumptions

#1. the sample is a random sample from the population of interest (true)
#2. the measurements are taken from a normal distribution, or the sample size
#   is large enough that the sample mean is normally distributed (true)

# One-sample t test computed from the summary statistics in the question.
# Only the mean, sd and n are given, so the statistic is calculated directly.
# Simulating 22 values with rnorm() and running t.test() on them gives a
# different answer on every run, and the simulated sample's mean and sd are
# not exactly 271 and 12.1.
mu0 <- 255
n_drinks <- 22
drink_mean <- 271
drink_sd <- 12.1

se <- drink_sd / sqrt(n_drinks) # standard error of the mean
se

t_stat <- (drink_mean - mu0) / se
t_stat

df <- n_drinks - 1
t_crit <- qt(0.05 / 2, df, lower.tail = FALSE) # critical t value (2.080)
t_crit

# Two-sided p-value, because HA is "not equal to 255 mg/oz".
p_value <- 2 * pt(abs(t_stat), df, lower.tail = FALSE)
p_value

# 95% confidence interval for the mean; mu0 = 255 is outside the interval.
c(lower = drink_mean - t_crit * se, upper = drink_mean + t_crit * se)

# p-value < 0.05 and t stat > 2.080
# reject H0
# conclusion: mean caffeine concentration is not equal to 255 mg/oz

#14. Suppose that the mean water hardness of lakes in Kansas is 452 mg/L and these values tend to

#follow a normal distribution. A limnologist would like to know whether stock ponds tend to have #lower hardness. He collects water from 25 randomly selected stock ponds, which yielded the #following results. Test the appropriate null hypothesis. #346 496 352 378 315 420 485 446 479 422 #494 289 436 516 615 491 360 385 500 558 #381 303 434 562 496

# One-sample, one-sided t test of stock-pond water hardness.
#H0: mean water hardness is equal to 452 mg/L
#HA: mean water hardness is less than 452 mg/L
wtr_samples <- c(346, 496, 352, 378, 315, 420, 485, 446, 479, 422,
                 494, 289, 436, 516, 615, 491, 360, 385, 500, 558,
                 381, 303, 434, 562, 496)

mu0 <- 452

mean(wtr_samples)
sd(wtr_samples)

# HA is directional ("less than"), so the test must be one-sided; the
# default two-sided test does not match the stated hypotheses.
hardness_test <- t.test(wtr_samples, mu = mu0, alternative = "less")
hardness_test

# Decision at alpha = 0.05. The sample mean (438.36) is below 452, but
# t is about -0.80 with a one-sided p-value of about 0.22, so the difference
# is not statistically significant.
if (hardness_test$p.value < 0.05) {
  cat("Reject H0: mean stock-pond hardness is less than 452 mg/L\n")
} else {
  cat("Fail to reject H0: no evidence that mean hardness is below 452 mg/L\n")
}

#15. A farmer is growing two varieties of hot peppers. He grows the plants under the same conditions, #and matches peppers from each plant harvested at the same time together. The heat of the peppers #from each plant, in Scoville units, is recorded in peppers.csv. Looking over the data, the farmer #concludes the peppers from plant A are hotter than those from plant B. Use the appropriate test to #evaluate this claim. For this problem, you may assume the distribution of pepper heat is normally #distributed. #H0: The mean heat of the peppers from Plants A and B are the same #HA: The mean heat of the peppers from PLants A and B are not the same

#assumptions paired t test #1. measurment is continuous or the range of possible values is large #2. matched pairs are measured #3. the distribution of difference is approx normal #4. data constitute a random sample from pop of interest

# Paired t test on the matched pepper heat measurements.
# file.choose() opens an interactive file picker for peppers.csv.
pepper <- read.csv(file.choose())

a <- pepper$PlantA
b <- pepper$PlantB
mean(a)
mean(b)

# Degrees of freedom for a paired test: number of pairs minus 1.
df <- length(a) - 1
df

# Two-sided critical value at alpha = 0.05 (the tail area is 0.05 / 2, not
# 0.5 / 2). The value 2.262157 recorded earlier is this critical value for
# df = 9, not the test statistic.
crit_value <- qt(0.05 / 2, df, lower.tail = FALSE)
crit_value

pepper_test <- t.test(a, b, paired = TRUE)
pepper_test

# Decision at alpha = 0.05 from the computed result.
# (Recorded from an earlier run: t statistic > t critical and p-value < 0.05,
# so H0 was rejected -- mean heat values are not the same for plants A and B.)
# NOTE(review): the farmer's claim is directional (A hotter than B); consider
# whether alternative = "greater" is the more appropriate test.
if (pepper_test$p.value < 0.05) {
  cat("Reject H0: mean heat differs between plants A and B\n")
} else {
  cat("Fail to reject H0: no evidence that mean heat differs\n")
}

#16. In a clinical trial conducted to evaluate the effectiveness of a new pain relief medication, 10 patients #were given the medication and rated their pain level on a scale of 1 to 10 both before and after #taking the drug. Each patient contributes a pair of datapoints consisting of discrete numeric #variables, the sample size is small, and normality cannot be assumed. Use the appropriate test to

#h0: pain level is equal before and after treatment #hA: pain level is not equal before and after treatment

#The assumptions of the Wilcoxon Signed Rank Test: #1.The data from which the differences are calculated is based on a random sample from #the population and with independent errors. #2.The differences are symmetric about the median.

# Wilcoxon signed rank test on the paired before/after pain ratings.
# file.choose() opens an interactive file picker for PainRatings.csv.
pain <- read.csv(file.choose())

before <- pain$BeforeRating
after <- pain$AfterRating

before
after

# No t critical value is computed here: the Wilcoxon signed rank test is
# rank-based and does not use the t distribution, so the decision is made
# from the test's own p-value.
pain_test <- wilcox.test(before, after, paired = TRUE)
pain_test

# Recorded from an earlier run: p-value = 0.04123
# p-value < 0.05
# reject H0
# conclusion: the pain level is not equal before and after treatment, i.e.
# the treatment does change the pain level

# (Question 16, continued) analyze the differences between the pairs. The data is contained in the file PainRatings.csv.

#17. During the summer, 12 athletes were given a new sports beverage. Fifteen minutes after drinking it, each athlete was asked if he felt better than, worse than, or the same as before he drank the beverage. Nine reported feeling better, one reported feeling worse, and two reported no change. We wish to know if drinking this beverage affects how athletes feel.

# Sign test run as an exact binomial test: 9 successes (felt better) out of
# the 10 athletes who reported a change. The "no change" responses are
# excluded, so the total counts successes and failures only.
binom.test(x = 9, n = 10)

#H0: new sports beverage has no effect on how athletes feel #HA: new sports beverage has an effect on how athletes feel # p-value = 0.02148 # p-value < 0.05 # reject # new sports drink has an impact on how athletes