# Question 10 (smokers): probability that a random sample of 55 adults
# contains 10 or fewer smokers when 21% of the population smokes.
n_people <- 55
p_smoke <- 0.21
max_smokers <- 10

# (a) Exact binomial probability P(X <= 10).
prob <- dbinom(0:max_smokers, n_people, p_smoke)
exact_prob <- sum(prob)
exact_prob

# (b) Normal approximation of the binomial:
#     mean = n * p and variance = n * p * (1 - p).
means <- n_people * p_smoke
norm_d <- n_people * p_smoke * (1 - p_smoke)
# The 0.5 continuity correction accounts for approximating a discrete
# distribution with a continuous one.
approx_prob <- pnorm(max_smokers + 0.5, means, sqrt(norm_d))
approx_prob

# Full binomial distribution for 0..55 smokers.
probs <- dbinom(0:n_people, n_people, p_smoke)
barplot(probs, names.arg = 0:n_people,
        main = "Binomial Distribution (n = 55, p = 0.21)",
        xlab = "Number of Smokers", ylab = "Probability",
        col = "steelblue")
#Math 410
#1. Using the data on pulse rate in Digital Appendix 3 (‘Student Data’), construct a separate histogram for the pulse rate of females and males. Be sure you use the same scale for both histograms. Are the distributions similar? If not, how do they differ? Also, create two box plots which compare the male and female data. What information can you infer from the box plots?
# Load the student data; the CSV file is picked interactively.
student_data <- read.csv(file.choose())

# Split the records by the Gender column.
male <- subset(student_data, Gender == "male")
female <- subset(student_data, Gender == "female")

# Histograms of pulse rate, one per gender, drawn with identical axis limits
# so the two plots can be compared directly.
pulse_xlim <- c(20, 120)
pulse_ylim <- c(0, 20)
hist(female$Pulse, breaks = 20, main = "Female Pulse Rate", col = "blue",
     xlim = pulse_xlim, ylim = pulse_ylim)
hist(male$Pulse, breaks = 20, main = "Male Pulse Rate", col = "green",
     xlim = pulse_xlim, ylim = pulse_ylim)
# Observation: the female pulse rates look relatively symmetric with a peak
# around 80, while the male pulse rates look bimodal. The female pulse rates
# seem to be higher than the male ones.
## Boxplots of pulse rate by gender.
# Colours follow the histograms: female = blue, male = green.
boxplot(female$Pulse, col = "blue",
        ylab = "Pulse Rate (bpm)",
        main = "Female Pulse Rates")
boxplot(male$Pulse, col = "green",
        ylab = "Pulse Rate (bpm)",
        main = "Male Pulse Rates")

# Side-by-side comparison. The group labels come from the list names, so each
# label is tied to its own data (previously the female data was drawn first
# but labelled "Male").
boxplot(list(Female = female$Pulse, Male = male$Pulse),
        col = c("blue", "green"),
        ylab = "Pulse Rate (bpm)",
        main = "Comparison of Pulse Rates by Gender")
# Original conclusion: males have a higher median and more outliers than
# females.
# NOTE(review): that conclusion was read from a plot whose labels were
# swapped; re-check it against the corrected plot.
#Construct a frequency distribution table and histogram from the data using an appropriate bin size. #Ensure both the table and graph are displayed in accordance with the rules described in Chapter 3.
# Question 2: frequency distribution table and histogram of bass lengths.
# The CSV file is picked interactively.
freq_data <- read.csv(file.choose())
# freq_data
bass_length <- freq_data$length.mm.
# bass_length

# Inspect the data range; the class limits below were chosen from it.
min(bass_length)
max(bass_length)

# Class limits: 50 mm wide bins from 0 to 500 mm.
bins <- seq(0, 500, by = 50)
bins

# Frequency table. Names that shadow base::cut() / base::table() are avoided.
length_class <- cut(bass_length, bins)
freq_table <- as.data.frame(table(length_class))
freq_table

# Histogram drawn with the same class limits as the table.
hist(bass_length, breaks = bins, main = "Bass Lengths(mm)",
     xlab = "Length (mm)", ylab = "Frequency", col = "violet")

#3. Expand the frequency distribution table generated in (2) above to include
# a new column for cumulative frequencies. Display the cumulative frequencies
# in both histogram and frequency polygon format using the R command freq.pol.
cum_sum <- cumsum(table(length_class))
cum_sum
freq_table$cum_freq <- cumsum(freq_table$Freq)
freq_table

# Bars are labelled with the actual class intervals (the names of cum_sum).
barplot(cum_sum,
        main = "Bass Lengths (mm)", xlab = "Length (mm)",
        ylab = "Cumulative Frequency", col = "springgreen4")

library(LearningStats)
# NOTE(review): confirm against the LearningStats manual that this call (name
# and arguments) draws *cumulative* frequencies.
freq.pol(bass_length, main = "Cumulative Frequency Polygon")
#4. A random sample of 42 belted kingfishers was collected from various
# locations in North America and their bill lengths were measured in mm. The
# collected data is contained in the file labeled 'kingfisher.csv'. Display a
# boxplot of the data. Compute the following statistics: mean, median, mode,
# range, interquartile range, 5th percentile, 95th percentile, variance,
# standard deviation, and coefficient of variation.
kingfisher <- read.csv(file.choose())
bill_length <- kingfisher$Bill_length

boxplot(bill_length, col = "steelblue", ylab = "Bill length (mm)",
        main = "Kingfisher Boxplot")

mean(bill_length)
median(bill_length)
sd(bill_length)
var(bill_length)
IQR(bill_length)

# Mode: the value(s) with the highest count in the frequency table.
# Recorded result: 56.5 and 58.5, each repeated 3 times.
tablekingfish <- table(bill_length)
tablekingfish
as.numeric(names(tablekingfish)[tablekingfish == max(tablekingfish)])

# Range: smallest and largest value, then their difference.
# Recorded result: 66 - 41.6 = 24.4.
range(bill_length)
diff(range(bill_length))

# 5th and 95th percentile
quantile(bill_length, probs = c(0.05, 0.95))

# Coefficient of variation, as a percentage of the mean.
CV <- (sd(bill_length) / mean(bill_length)) * 100
CV
#5. Select five simple random samples of 25, 50, and 100 male mosquitofish lengths from the data in #the Mosquitofish.csv file. Compute the mean and standard deviation for these samples. Consider #the measurements in the file to be the entire population of interest with a population mean of 28.8 #mm and a population standard deviation of 6.84 mm. Are the means and standard deviations of #your samples the same as the population mean and standard deviation? How do you account for #any difference? What trends are you seeing?
# Load the mosquitofish data (file picked interactively) and keep the lengths
# of the male fish only.
length_data <- read.csv(file.choose())
male_lengths <- length_data$FishLength[length_data$Gender == "M"]
male_lengths

# Draw `n_samples` simple random samples (without replacement) of `size`
# values from `x` and return one row per sample with its mean and standard
# deviation.
sample_stats <- function(x, size, n_samples = 5) {
  stopifnot(size <= length(x))
  draws <- replicate(n_samples, sample(x, size, replace = FALSE),
                     simplify = FALSE)
  data.frame(
    sample = seq_len(n_samples),
    mean = vapply(draws, mean, numeric(1)),
    sd = vapply(draws, sd, numeric(1))
  )
}

# Five samples of 25: per-sample statistics, then their averages.
stats_25 <- sample_stats(male_lengths, 25)
stats_25
avg_mean_25 <- mean(stats_25$mean)
avg_sd_25 <- mean(stats_25$sd)
avg_mean_25
avg_sd_25

# Five samples of 50
stats_50 <- sample_stats(male_lengths, 50)
stats_50
avg_mean_50 <- mean(stats_50$mean)
avg_sd_50 <- mean(stats_50$sd)
avg_mean_50
avg_sd_50

# Five samples of 100
stats_100 <- sample_stats(male_lengths, 100)
stats_100
avg_mean_100 <- mean(stats_100$mean)
avg_sd_100 <- mean(stats_100$sd)
avg_mean_100
avg_sd_100
#The standard deviation and sample means are much lower than the population mean
#and standard deviation, likely because we are sampling only from males and not
#females as well. The male fish likely have smaller average lengths than females
# and thus differ in the mean and sd.
#ability to interact socially. According to Christensen et. al. (2016), about 1 in every 68 children #born in the United States has ASD at the age of eight years. Suppose families consist of three #children. We will assume that the occurrence of children with ASD are independent events within #each family. #a. Determine the binomial distribution for 𝑥 = 0, 1, 2, and 3 children with ASD and display #the results in both tabular and graphical format. #b. Next, determine the probability that two or more children in a family of four have ASD. #c. Finally, imagine that a small city has 12,000 children. What is the expected number of #children with ASD?
# (a) Binomial distribution of the number of children with ASD in a family of
#     three, with a per-child probability of 1 in 68.
pvalue <- 1 / 68
pvalue
ntrials <- 3
x <- seq(0, ntrials, by = 1)
prob <- dbinom(x, size = ntrials, prob = pvalue)
prob
sum(prob)
barplot(prob, names.arg = x, main = "ASD Probability Distribution",
        xlab = "Number of People", col = "sienna")

# (b) Probability of two or more children with ASD.
# NOTE(review): the question text says "a family of four" here, but this uses
# the same three-child family as part (a) -- confirm which is intended.
prob <- dbinom(seq(2, ntrials), size = ntrials, prob = pvalue)
prob
sum(prob)

# (c) Expected number of children with ASD among 12,000 children.
expval <- 12000 * pvalue
expval
#quadrats and their associated frequencies. Using the appropriate Chi-Square test, determine #whether or not the seedlings are randomly distributed in the sampled habitat. Recall that when items #are randomly distributed, they tend to follow a Poisson distribution. Be sure the assumption of the #test you select are met. #Number of Plants / Quadrat Frequency #H0: seedlings follow poisson distribution (are randomly distributed) #HA: seedlings don’t follow poisson distribution (are not randomly distributed)
# assumptions
# goodness of fit;
# - data are freq in mutually exclusive categories
# - objects are independent
# - each category has large enough expected frequency
# Observed data: `freqs[i]` quadrats contained `plants[i]` seedlings.
plants <- c(0, 1, 2, 3, 4, 5, 6)
freqs <- c(35, 28, 15, 10, 7, 5, 0)
n_quadrats <- sum(freqs)

# Average number of plants per quadrat, estimated from the sample (1.41).
lambda <- sum(plants * freqs) / n_quadrats
lambda

# Poisson probabilities for 0..5 plants; they are largest near the mean and
# then taper off. The 7th class collects the remaining tail (6 or more).
probs <- dpois(0:5, lambda)
probs
sum(probs)
probs[7] <- 1 - sum(probs)
probs
sum(probs)

expected <- probs * n_quadrats
expected

# Pool the classes 4, 5 and 6+ so that every class has a large enough
# expected frequency (observed: 7 + 5 + 0 = 12).
observed <- c(freqs[1:4], sum(freqs[5:7]))
observed
expected_combined <- c(expected[1:4], sum(expected[5:7]))
expected_combined

chisq_stat <- sum((observed - expected_combined)^2 / expected_combined)
# Degrees of freedom: classes - 1, minus one more because lambda was
# estimated from the same data.
df <- length(observed) - 1 - 1
p_value <- pchisq(chisq_stat, df, lower.tail = FALSE)
chisq_stat
p_value
# p-value < alpha of 0.05
# reject H0
# conclusion: seedlings don't follow a Poisson distribution and are not
# randomly distributed
#8. One of the largest health experiments ever conducted investigated the effectiveness of the Salk #vaccine in preventing paralysis and death from poliomyelitis. Test the null hypothesis that the #treatment and rate of polio are independent using the appropriate Chi-Square test. Be sure the #assumptions of the test you select are met. #Treatment Number with Polio Number without Polio #Salk Vaccine 57 200,688 #Placebo Control 142 201,087 #test of association # assumptions # - Data are frequencies placed into mutually exclusive cells. # - Observations are independent of one another. # - Not more than 20% of the cells may have an expected value of < 5.00, and no # cell may have an expected value < 1.00 # - For a 2 x 2 contingency table, all cells must have an expected value of 5.00 #or greater. # might have to look at fischer exact test
# H0: the vaccine has no effect on the rate of polio
# HA: the vaccine does have an effect on the rate of polio

# One row per treatment: c(number with polio, number without polio).
R1 <- c(57, 200688)    # Salk vaccine
R2 <- c(142, 201087)   # placebo control

# Stack the rows into a 2 x 2 table and run the chi-square test of
# association without the continuity correction.
polio <- rbind(R1, R2, deparse.level = 0)
chisq.test(polio, correct = FALSE)
# p-value < alpha of 0.05
# reject H0
# conclusion: the vaccine does have an effect on the rate of polio
#9. Using the data in the eggs.csv file, determine if the eggs in ounces tend to follow a normal distribution. #a. Construct a histogram and compare its shape to a normal curve. #b. Create a quantile plot. Does the plotted data deviate from a straight line? # c. Compute the values of skewness and kurtosis. What are the expected values of skewness and #kurtosis for a normal distribution? Are your computed values close to the expectation? # d. Finally, run a Shapiro-Wilk normality test. #e. Can you conclude that this sample follows a normal distribution?
# Load the egg-size data; the CSV file is picked interactively.
eggs <- read.csv(file.choose())
# eggs
eggdata <- eggs$Egg.Sizes

# (a) Histogram of the egg sizes (the question gives the sizes in ounces).
hist(eggdata, col = "red", main = "Histogram of Egg Sizes (oz)",
     xlab = "Egg Size (oz)")

# (b) Normal quantile plot with reference line.
qqnorm(eggdata)
qqline(eggdata)
# it deviates slightly at the tail but not in the middle

# (c) Skewness and kurtosis. For a normal distribution the expected skewness
#     is 0 and the expected (Pearson) kurtosis is 3.
library(moments)
skewness(eggdata) # close to expected value
kurtosis(eggdata) # close to expected value

# (d) Shapiro-Wilk normality test.
# H0: The egg sizes are normally distributed
# HA: The egg sizes are not normally distributed
shapiro.test(eggdata)
# p-value << 0.05
# reject the null hypothesis of normality
# The egg sizes are not normally distributed
#10. According to the National Center for Health Statistics for 2015, 21% of adults in Pennsylvania
#smoke. We wish to know the probability of selecting a random sample of 55 people containing 10 #or fewer smokers from this population. #a. Use a statistical program to determine the exact probability of getting 10 or fewer smokers. #b. Now use the normal approximation of the binomial distribution to answer the same question. #Compare your answers. Are they similar?
#standard deviation of 15 meters. Find a 95% confidence interval for the
#average height of mature Redwoods. For the critical t value, first estimate
#it from the table in the text (Appendix A.2) using interpolation. Then use R
#to get the exact value. How close are they?
n_trees <- 85        # sample size
mean_height <- 225   # sample mean
sd_height <- 15      # sample standard deviation

# Critical t value; df is 84, which is the sample size minus 1.
t_crit <- qt(0.05 / 2, n_trees - 1, lower.tail = FALSE)
t_crit

# Standard error of the mean.
se <- sd_height / sqrt(n_trees)
se

# Sample mean plus / minus (critical t value * standard error).
upper <- mean_height + t_crit * se
upper
lower <- mean_height - t_crit * se
lower
# the mean would fall between these 2 limits
#The mean length 𝑥 was 50.0 mm and the standard deviation 𝑠 was 10.0 mm. Compute the 95% #confidence interval for the mean tail length 𝜇. Next compute the 90% confidence interval. Interpret #what these confidence intervals represent.
# Two-sided t confidence interval for a mean from summary statistics.
#   xbar: sample mean, s: sample standard deviation, n: sample size,
#   conf_level: confidence level (e.g. 0.95).
# Returns c(lower = ..., upper = ...).
t_conf_int <- function(xbar, s, n, conf_level = 0.95) {
  alpha <- 1 - conf_level
  t_crit <- qt(alpha / 2, df = n - 1, lower.tail = FALSE)  # critical t value
  margin <- t_crit * s / sqrt(n)                           # t * standard error
  c(lower = xbar - margin, upper = xbar + margin)
}

# Standard error of the mean (s = 10, n = 24).
se <- 10 / sqrt(24)
se

# 95% confidence interval: sample mean plus / minus t value * se
ci_95 <- t_conf_int(50, 10, 24, conf_level = 0.95)
ci_95

# 90% confidence interval
ci_90 <- t_conf_int(50, 10, 24, conf_level = 0.90)
ci_90
#13. The label on a company’s energy drinks claims that they contain an average caffeine concentration #of 255 mg/oz. The mean concentration of 22 randomly sampled drinks was 271, with a standard #deviation of 12.1. Are the drink labels accurate?
#H0: mean caffiene concentration is equal to 255 mg/oz #HA: mean caffiene concentration is NOT equal to 255 mg/oz
# Assumptions of the one-sample t test:
#1. the sample is a random sample from the population of interest (true)
#2. the measurements come from a normal distribution, or the sample size is
#   large enough that the sample mean is normally distributed (true)
mu0 <- 255            # labelled mean concentration (mg/oz)
n_drinks <- 22        # sample size
sample_mean <- 271    # sample mean
sample_sd <- 12.1     # sample standard deviation

# The test is computed directly from the summary statistics. Simulating data
# with rnorm() would give a different mean, sd and p-value on every run.
se <- sample_sd / sqrt(n_drinks)
t_stat <- (sample_mean - mu0) / se
t_stat

deg_free <- n_drinks - 1
t_crit <- qt(0.05 / 2, deg_free, lower.tail = FALSE)  # critical t value
t_crit

# Two-sided p-value.
p_value <- 2 * pt(abs(t_stat), deg_free, lower.tail = FALSE)
p_value

# 95% confidence interval for the mean; mu0 lies outside the interval.
c(sample_mean - t_crit * se, sample_mean + t_crit * se)
# p-value < 0.05
# reject H0
#follow a normal distribution. A limnologist would like to know whether stock ponds tend to have #lower hardness. He collects water from 25 randomly selected stock ponds, which yielded the #following results. Test the appropriate null hypothesis. #346 496 352 378 315 420 485 446 479 422 #494 289 436 516 615 491 360 385 500 558 #381 303 434 562 496
#H0: mean water hardness is equal to 452 mg/L
#HA: mean water hardness is less than 452 mg/L
wtr_samples <- c(346, 496, 352, 378, 315, 420, 485, 446, 479, 422,
                 494, 289, 436, 516, 615, 491, 360, 385, 500, 558,
                 381, 303, 434, 562, 496)
mu0 <- 452
mean(wtr_samples)
sd(wtr_samples)

# One-sided test, matching the alternative hypothesis "less than 452".
hardness_test <- t.test(wtr_samples, mu = mu0, alternative = "less")
hardness_test
#15. A farmer is growing two varieties of hot peppers. He grows the plants under the same conditions, #and matches peppers from each plant harvested at the same time together. The heat of the peppers #from each plant, in Scoville units, is recorded in peppers.csv. Looking over the data, the farmer #concludes the peppers from plant A are hotter than those from plant B. Use the appropriate test to #evaluate this claim. For this problem, you may assume the distribution of pepper heat is normally #distributed. #H0: The mean heat of the peppers from Plants A and B are the same #HA: The mean heat of the peppers from PLants A and B are not the same
#assumptions paired t test #1. measurment is continuous or the range of possible values is large #2. matched pairs are measured #3. the distribution of difference is approx normal #4. data constitute a random sample from pop of interest
# Load the paired pepper heat measurements; the file is picked interactively.
pepper <- read.csv(file.choose())
a <- pepper$PlantA
b <- pepper$PlantB
mean(a)
mean(b)

# Degrees of freedom for the paired test: number of pairs minus 1.
df <- length(a) - 1
df

# Two-sided critical t value at alpha = 0.05.
crit_value <- qt(0.05 / 2, df, lower.tail = FALSE)
crit_value

# Paired t test, two-sided to match the stated hypotheses.
# NOTE(review): the farmer's claim is directional (A hotter than B); consider
# alternative = "greater" if a one-sided test is intended.
t.test(a, b, paired = TRUE)
# t statistic > t critical
# reject H0
# p-value < alpha 0.05
# reject H0
# mean heat values are not the same for plants A and B
#16. In a clinical trial conducted to evaluate the effectiveness of a new pain relief medication, 10 patients #were given the medication and rated their pain level on a scale of 1 to 10 both before and after #taking the drug. Each patient contributes a pair of datapoints consisting of discrete numeric #variables, the sample size is small, and normality cannot be assumed. Use the appropriate test to
#h0: pain level is equal before and after treatment #hA: pain level is not equal before and after treatment
#The assumptions of the Wilcoxon Signed Rank Test: #1.The data from which the differences are calculated is based on a random sample from #the population and with independent errors. #2.The differences are symmetric about the median.
# Load the before/after pain ratings; the file is picked interactively.
pain <- read.csv(file.choose())
before <- pain$BeforeRating
after <- pain$AfterRating
before
after

# Wilcoxon signed rank test on the paired ratings. No t critical value is
# computed here: the signed rank statistic does not follow a t distribution,
# so the decision is based on the p-value reported by the test.
wilcox.test(before, after, paired = TRUE)
#analyze the differences between the pairs. The data is contained in the file PainRatings.csv.
#17. During the summer, 12 athletes were given a new sports beverage. Fifteen minutes after drinking it, each athlete was asked if he felt better than, worse than, or the same as before he drank the beverage. Nine reported feeling better, one reported feeling worse, and two reported no change. We wish to know if drinking this beverage affects how athletes feel.
# Sign test: 9 successes (felt better) out of 10 athletes who reported a
# change; the "no change" responses are excluded from the total.
binom.test(x = 9, n = 10, p = 0.5, alternative = "two.sided")
#H0: new sports beverage has no effect on how athletes feel #HA: new sports beverage has an effect on how athletes feel # p-value = 0.02148 # p-value < 0.05 # reject # new sports drink has an impact on how athletes